diff --git a/src/common/recordbatch/src/recordbatch.rs b/src/common/recordbatch/src/recordbatch.rs index b768a2f0bcc8..5fc886f8b9d1 100644 --- a/src/common/recordbatch/src/recordbatch.rs +++ b/src/common/recordbatch/src/recordbatch.rs @@ -23,6 +23,7 @@ use snafu::ResultExt; use crate::error::{self, Result}; +// TODO(yingwen): We should hold vectors in the RecordBatch. #[derive(Clone, Debug, PartialEq)] pub struct RecordBatch { pub schema: SchemaRef, @@ -103,6 +104,7 @@ impl<'a> Iterator for RecordBatchRowIterator<'a> { } else { let mut row = Vec::with_capacity(self.columns); + // TODO(yingwen): Get from the vector if RecordBatch also holds vectors. for col in 0..self.columns { let column_array = self.record_batch.df_recordbatch.column(col); match arrow_array_get(column_array.as_ref(), self.row_cursor) diff --git a/src/datatypes2/src/arrow_array.rs b/src/datatypes2/src/arrow_array.rs index dcd9e4bd7f62..7405c8a665af 100644 --- a/src/datatypes2/src/arrow_array.rs +++ b/src/datatypes2/src/arrow_array.rs @@ -12,14 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. -use arrow::array::Array; +use arrow::array::{ + Array, BooleanArray, Date32Array, Date64Array, Float32Array, Float64Array, Int16Array, + Int32Array, Int64Array, Int8Array, ListArray, UInt16Array, UInt32Array, UInt64Array, + UInt8Array, +}; use arrow::datatypes::DataType; use common_time::timestamp::TimeUnit; use common_time::Timestamp; use snafu::OptionExt; +use crate::data_type::ConcreteDataType; use crate::error::{ConversionSnafu, Result}; -use crate::value::Value; +use crate::value::{ListValue, Value}; pub type BinaryArray = arrow::array::LargeBinaryArray; pub type MutableBinaryArray = arrow::array::LargeBinaryBuilder; @@ -36,6 +41,7 @@ macro_rules! cast_array { }; } +// TODO(yingwen): Remove this function. pub fn arrow_array_get(array: &dyn Array, idx: usize) -> Result { if array.is_null(idx) { return Ok(Value::Null); @@ -43,27 +49,21 @@ pub fn arrow_array_get(array: &dyn Array, idx: usize) -> Result { let result = match array.data_type() { DataType::Null => Value::Null, - // DataType::Boolean => Value::Boolean(cast_array!(array, array::BooleanArray).value(idx)), - DataType::Binary | DataType::LargeBinary => { - Value::Binary(cast_array!(array, BinaryArray).value(idx).into()) - } - // DataType::Int8 => Value::Int8(cast_array!(array, PrimitiveArray::).value(idx)), - // DataType::Int16 => Value::Int16(cast_array!(array, PrimitiveArray::).value(idx)), - // DataType::Int32 => Value::Int32(cast_array!(array, PrimitiveArray::).value(idx)), - // DataType::Int64 => Value::Int64(cast_array!(array, PrimitiveArray::).value(idx)), - // DataType::UInt8 => Value::UInt8(cast_array!(array, PrimitiveArray::).value(idx)), - // DataType::UInt16 => Value::UInt16(cast_array!(array, PrimitiveArray::).value(idx)), - // DataType::UInt32 => Value::UInt32(cast_array!(array, PrimitiveArray::).value(idx)), - // DataType::UInt64 => Value::UInt64(cast_array!(array, PrimitiveArray::).value(idx)), - // DataType::Float32 => { - // Value::Float32(cast_array!(array, PrimitiveArray::).value(idx).into()) - // } - // DataType::Float64 => { - // Value::Float64(cast_array!(array, PrimitiveArray::).value(idx).into()) - // } - // DataType::Utf8 | DataType::LargeUtf8 => { - // Value::String(cast_array!(array, StringArray).value(idx).into()) - // } + DataType::Boolean => Value::Boolean(cast_array!(array, BooleanArray).value(idx)), + DataType::Binary => Value::Binary(cast_array!(array, BinaryArray).value(idx).into()), + DataType::Int8 => Value::Int8(cast_array!(array, Int8Array).value(idx)), + DataType::Int16 => Value::Int16(cast_array!(array, Int16Array).value(idx)), + DataType::Int32 => Value::Int32(cast_array!(array, Int32Array).value(idx)), + DataType::Int64 => Value::Int64(cast_array!(array, Int64Array).value(idx)), + DataType::UInt8 => Value::UInt8(cast_array!(array, UInt8Array).value(idx)), + DataType::UInt16 => Value::UInt16(cast_array!(array, UInt16Array).value(idx)), + DataType::UInt32 => Value::UInt32(cast_array!(array, UInt32Array).value(idx)), + DataType::UInt64 => Value::UInt64(cast_array!(array, UInt64Array).value(idx)), + DataType::Float32 => Value::Float32(cast_array!(array, Float32Array).value(idx).into()), + DataType::Float64 => Value::Float64(cast_array!(array, Float64Array).value(idx).into()), + DataType::Utf8 => Value::String(cast_array!(array, StringArray).value(idx).into()), + DataType::Date32 => Value::Date(cast_array!(array, Date32Array).value(idx).into()), + DataType::Date64 => Value::DateTime(cast_array!(array, Date64Array).value(idx).into()), DataType::Timestamp(t, _) => match t { arrow::datatypes::TimeUnit::Second => Value::Timestamp(Timestamp::new( cast_array!(array, arrow::array::TimestampSecondArray).value(idx), @@ -82,158 +82,36 @@ pub fn arrow_array_get(array: &dyn Array, idx: usize) -> Result { TimeUnit::Nanosecond, )), }, - // DataType::List(_) => { - // let array = cast_array!(array, ListArray::).value(idx); - // let inner_datatype = ConcreteDataType::try_from(array.data_type())?; - // let values = (0..array.len()) - // .map(|i| arrow_array_get(&*array, i)) - // .collect::>>()?; - // Value::List(ListValue::new(Some(Box::new(values)), inner_datatype)) - // } + DataType::List(_) => { + let array = cast_array!(array, ListArray).value(idx); + let item_type = ConcreteDataType::try_from(array.data_type())?; + let values = (0..array.len()) + .map(|i| arrow_array_get(&*array, i)) + .collect::>>()?; + Value::List(ListValue::new(Some(Box::new(values)), item_type)) + } _ => unimplemented!("Arrow array datatype: {:?}", array.data_type()), }; Ok(result) } -// #[cfg(test)] -// mod test { -// use arrow::array::{ -// BooleanArray, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, -// UInt16Array, UInt32Array, UInt64Array, -// UInt8Array, -// }; -// use arrow::buffer::Buffer; -// use arrow::datatypes::{DataType, TimeUnit as ArrowTimeUnit}; -// use common_time::timestamp::{TimeUnit, Timestamp}; - -// use super::*; -// use crate::prelude::Vector; -// use crate::vectors::TimestampVector; - -// #[test] -// fn test_arrow_array_access() { -// let array1 = BooleanArray::from_slice(vec![true, true, false, false]); -// assert_eq!(Value::Boolean(true), arrow_array_get(&array1, 1).unwrap()); -// let array1 = Int8Array::from_vec(vec![1, 2, 3, 4]); -// assert_eq!(Value::Int8(2), arrow_array_get(&array1, 1).unwrap()); -// let array1 = UInt8Array::from_vec(vec![1, 2, 3, 4]); -// assert_eq!(Value::UInt8(2), arrow_array_get(&array1, 1).unwrap()); -// let array1 = Int16Array::from_vec(vec![1, 2, 3, 4]); -// assert_eq!(Value::Int16(2), arrow_array_get(&array1, 1).unwrap()); -// let array1 = UInt16Array::from_vec(vec![1, 2, 3, 4]); -// assert_eq!(Value::UInt16(2), arrow_array_get(&array1, 1).unwrap()); -// let array1 = Int32Array::from_vec(vec![1, 2, 3, 4]); -// assert_eq!(Value::Int32(2), arrow_array_get(&array1, 1).unwrap()); -// let array1 = UInt32Array::from_vec(vec![1, 2, 3, 4]); -// assert_eq!(Value::UInt32(2), arrow_array_get(&array1, 1).unwrap()); -// let array = Int64Array::from_vec(vec![1, 2, 3, 4]); -// assert_eq!(Value::Int64(2), arrow_array_get(&array, 1).unwrap()); -// let array1 = UInt64Array::from_vec(vec![1, 2, 3, 4]); -// assert_eq!(Value::UInt64(2), arrow_array_get(&array1, 1).unwrap()); -// let array1 = Float32Array::from_vec(vec![1f32, 2f32, 3f32, 4f32]); -// assert_eq!( -// Value::Float32(2f32.into()), -// arrow_array_get(&array1, 1).unwrap() -// ); -// let array1 = Float64Array::from_vec(vec![1f64, 2f64, 3f64, 4f64]); -// assert_eq!( -// Value::Float64(2f64.into()), -// arrow_array_get(&array1, 1).unwrap() -// ); - -// let array2 = StringArray::from(vec![Some("hello"), None, Some("world")]); -// assert_eq!( -// Value::String("hello".into()), -// arrow_array_get(&array2, 0).unwrap() -// ); -// assert_eq!(Value::Null, arrow_array_get(&array2, 1).unwrap()); - -// let array3 = super::BinaryArray::from(vec![ -// Some("hello".as_bytes()), -// None, -// Some("world".as_bytes()), -// ]); -// assert_eq!( -// Value::Binary("hello".as_bytes().into()), -// arrow_array_get(&array3, 0).unwrap() -// ); -// assert_eq!(Value::Null, arrow_array_get(&array3, 1).unwrap()); - -// let vector = TimestampVector::new(Int64Array::from_vec(vec![1, 2, 3, 4])); -// let array = vector.to_boxed_arrow_array(); -// let value = arrow_array_get(&*array, 1).unwrap(); -// assert_eq!( -// value, -// Value::Timestamp(Timestamp::new(2, TimeUnit::Millisecond)) -// ); - -// let array4 = PrimitiveArray::::from_data( -// DataType::Timestamp(ArrowTimeUnit::Millisecond, None), -// Buffer::from_slice(&vec![1, 2, 3, 4]), -// None, -// ); -// assert_eq!( -// Value::Timestamp(Timestamp::new(1, TimeUnit::Millisecond)), -// arrow_array_get(&array4, 0).unwrap() -// ); - -// let array4 = PrimitiveArray::::from_data( -// DataType::Timestamp(ArrowTimeUnit::Nanosecond, None), -// Buffer::from_slice(&vec![1, 2, 3, 4]), -// None, -// ); -// assert_eq!( -// Value::Timestamp(Timestamp::new(1, TimeUnit::Nanosecond)), -// arrow_array_get(&array4, 0).unwrap() -// ); - -// // test list array -// // FIXME(yingwen): Fix this test. -// // let data = vec![ -// // Some(vec![Some(1i32), Some(2), Some(3)]), -// // None, -// // Some(vec![Some(4), None, Some(6)]), -// // ]; - -// // let mut arrow_array = MutableListArray::>::new(); -// // arrow_array.try_extend(data).unwrap(); -// // let arrow_array: ListArray = arrow_array.into(); - -// // let v0 = arrow_array_get(&arrow_array, 0).unwrap(); -// // match v0 { -// // Value::List(list) => { -// // assert!(matches!(list.datatype(), ConcreteDataType::Int32(_))); -// // let items = list.items().as_ref().unwrap(); -// // assert_eq!( -// // **items, -// // vec![Value::Int32(1), Value::Int32(2), Value::Int32(3)] -// // ); -// // } -// // _ => unreachable!(), -// // } - -// // assert_eq!(Value::Null, arrow_array_get(&arrow_array, 1).unwrap()); -// // let v2 = arrow_array_get(&arrow_array, 2).unwrap(); -// // match v2 { -// // Value::List(list) => { -// // assert!(matches!(list.datatype(), ConcreteDataType::Int32(_))); -// // let items = list.items().as_ref().unwrap(); -// // assert_eq!(**items, vec![Value::Int32(4), Value::Null, Value::Int32(6)]); -// // } -// // _ => unreachable!(), -// // } -// } -// } - #[cfg(test)] -mod tests { +mod test { use std::sync::Arc; + use arrow::array::{ + BooleanArray, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, + LargeBinaryArray, TimestampMicrosecondArray, TimestampMillisecondArray, + TimestampNanosecondArray, TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, + UInt8Array, + }; + use arrow::datatypes::Int32Type; + use common_time::timestamp::{TimeUnit, Timestamp}; use paste::paste; use super::*; - use crate::prelude::ConcreteDataType; + use crate::data_type::ConcreteDataType; use crate::types::TimestampType; macro_rules! test_arrow_array_get_for_timestamps { @@ -261,4 +139,104 @@ mod tests { fn test_timestamp_array() { test_arrow_array_get_for_timestamps![Second, Millisecond, Microsecond, Nanosecond]; } + + #[test] + fn test_arrow_array_access() { + let array1 = BooleanArray::from(vec![true, true, false, false]); + assert_eq!(Value::Boolean(true), arrow_array_get(&array1, 1).unwrap()); + let array1 = Int8Array::from(vec![1, 2, 3, 4]); + assert_eq!(Value::Int8(2), arrow_array_get(&array1, 1).unwrap()); + let array1 = UInt8Array::from(vec![1, 2, 3, 4]); + assert_eq!(Value::UInt8(2), arrow_array_get(&array1, 1).unwrap()); + let array1 = Int16Array::from(vec![1, 2, 3, 4]); + assert_eq!(Value::Int16(2), arrow_array_get(&array1, 1).unwrap()); + let array1 = UInt16Array::from(vec![1, 2, 3, 4]); + assert_eq!(Value::UInt16(2), arrow_array_get(&array1, 1).unwrap()); + let array1 = Int32Array::from(vec![1, 2, 3, 4]); + assert_eq!(Value::Int32(2), arrow_array_get(&array1, 1).unwrap()); + let array1 = UInt32Array::from(vec![1, 2, 3, 4]); + assert_eq!(Value::UInt32(2), arrow_array_get(&array1, 1).unwrap()); + let array = Int64Array::from(vec![1, 2, 3, 4]); + assert_eq!(Value::Int64(2), arrow_array_get(&array, 1).unwrap()); + let array1 = UInt64Array::from(vec![1, 2, 3, 4]); + assert_eq!(Value::UInt64(2), arrow_array_get(&array1, 1).unwrap()); + let array1 = Float32Array::from(vec![1f32, 2f32, 3f32, 4f32]); + assert_eq!( + Value::Float32(2f32.into()), + arrow_array_get(&array1, 1).unwrap() + ); + let array1 = Float64Array::from(vec![1f64, 2f64, 3f64, 4f64]); + assert_eq!( + Value::Float64(2f64.into()), + arrow_array_get(&array1, 1).unwrap() + ); + + let array2 = StringArray::from(vec![Some("hello"), None, Some("world")]); + assert_eq!( + Value::String("hello".into()), + arrow_array_get(&array2, 0).unwrap() + ); + assert_eq!(Value::Null, arrow_array_get(&array2, 1).unwrap()); + + let array3 = LargeBinaryArray::from(vec![ + Some("hello".as_bytes()), + None, + Some("world".as_bytes()), + ]); + assert_eq!(Value::Null, arrow_array_get(&array3, 1).unwrap()); + + let array = TimestampSecondArray::from(vec![1, 2, 3]); + let value = arrow_array_get(&array, 1).unwrap(); + assert_eq!(value, Value::Timestamp(Timestamp::new(2, TimeUnit::Second))); + let array = TimestampMillisecondArray::from(vec![1, 2, 3]); + let value = arrow_array_get(&array, 1).unwrap(); + assert_eq!( + value, + Value::Timestamp(Timestamp::new(2, TimeUnit::Millisecond)) + ); + let array = TimestampMicrosecondArray::from(vec![1, 2, 3]); + let value = arrow_array_get(&array, 1).unwrap(); + assert_eq!( + value, + Value::Timestamp(Timestamp::new(2, TimeUnit::Microsecond)) + ); + let array = TimestampNanosecondArray::from(vec![1, 2, 3]); + let value = arrow_array_get(&array, 1).unwrap(); + assert_eq!( + value, + Value::Timestamp(Timestamp::new(2, TimeUnit::Nanosecond)) + ); + + // test list array + let data = vec![ + Some(vec![Some(1), Some(2), Some(3)]), + None, + Some(vec![Some(4), None, Some(6)]), + ]; + let arrow_array = ListArray::from_iter_primitive::(data); + + let v0 = arrow_array_get(&arrow_array, 0).unwrap(); + match v0 { + Value::List(list) => { + assert!(matches!(list.datatype(), ConcreteDataType::Int32(_))); + let items = list.items().as_ref().unwrap(); + assert_eq!( + **items, + vec![Value::Int32(1), Value::Int32(2), Value::Int32(3)] + ); + } + _ => unreachable!(), + } + + assert_eq!(Value::Null, arrow_array_get(&arrow_array, 1).unwrap()); + let v2 = arrow_array_get(&arrow_array, 2).unwrap(); + match v2 { + Value::List(list) => { + assert!(matches!(list.datatype(), ConcreteDataType::Int32(_))); + let items = list.items().as_ref().unwrap(); + assert_eq!(**items, vec![Value::Int32(4), Value::Null, Value::Int32(6)]); + } + _ => unreachable!(), + } + } } diff --git a/src/datatypes2/src/data_type.rs b/src/datatypes2/src/data_type.rs index 282546435f47..0d06d566b667 100644 --- a/src/datatypes2/src/data_type.rs +++ b/src/datatypes2/src/data_type.rs @@ -61,66 +61,67 @@ pub enum ConcreteDataType { List(ListType), } -// TODO(yingwen): Consider moving these methods to the DataType trait. +// TODO(yingwen): Refactor these `is_xxx()` methods, such as adding a `properties()` method +// returning all these properties to the `DataType` trait impl ConcreteDataType { - // pub fn is_float(&self) -> bool { - // matches!( - // self, - // ConcreteDataType::Float64(_) | ConcreteDataType::Float32(_) - // ) - // } - - // pub fn is_boolean(&self) -> bool { - // matches!(self, ConcreteDataType::Boolean(_)) - // } - - // pub fn stringifiable(&self) -> bool { - // matches!( - // self, - // ConcreteDataType::String(_) - // | ConcreteDataType::Date(_) - // | ConcreteDataType::DateTime(_) - // | ConcreteDataType::Timestamp(_) - // ) - // } - - // pub fn is_signed(&self) -> bool { - // matches!( - // self, - // ConcreteDataType::Int8(_) - // | ConcreteDataType::Int16(_) - // | ConcreteDataType::Int32(_) - // | ConcreteDataType::Int64(_) - // | ConcreteDataType::Date(_) - // | ConcreteDataType::DateTime(_) - // | ConcreteDataType::Timestamp(_) - // ) - // } - - // pub fn is_unsigned(&self) -> bool { - // matches!( - // self, - // ConcreteDataType::UInt8(_) - // | ConcreteDataType::UInt16(_) - // | ConcreteDataType::UInt32(_) - // | ConcreteDataType::UInt64(_) - // ) - // } - - // pub fn numerics() -> Vec { - // vec![ - // ConcreteDataType::int8_datatype(), - // ConcreteDataType::int16_datatype(), - // ConcreteDataType::int32_datatype(), - // ConcreteDataType::int64_datatype(), - // ConcreteDataType::uint8_datatype(), - // ConcreteDataType::uint16_datatype(), - // ConcreteDataType::uint32_datatype(), - // ConcreteDataType::uint64_datatype(), - // ConcreteDataType::float32_datatype(), - // ConcreteDataType::float64_datatype(), - // ] - // } + pub fn is_float(&self) -> bool { + matches!( + self, + ConcreteDataType::Float64(_) | ConcreteDataType::Float32(_) + ) + } + + pub fn is_boolean(&self) -> bool { + matches!(self, ConcreteDataType::Boolean(_)) + } + + pub fn is_stringifiable(&self) -> bool { + matches!( + self, + ConcreteDataType::String(_) + | ConcreteDataType::Date(_) + | ConcreteDataType::DateTime(_) + | ConcreteDataType::Timestamp(_) + ) + } + + pub fn is_signed(&self) -> bool { + matches!( + self, + ConcreteDataType::Int8(_) + | ConcreteDataType::Int16(_) + | ConcreteDataType::Int32(_) + | ConcreteDataType::Int64(_) + | ConcreteDataType::Date(_) + | ConcreteDataType::DateTime(_) + | ConcreteDataType::Timestamp(_) + ) + } + + pub fn is_unsigned(&self) -> bool { + matches!( + self, + ConcreteDataType::UInt8(_) + | ConcreteDataType::UInt16(_) + | ConcreteDataType::UInt32(_) + | ConcreteDataType::UInt64(_) + ) + } + + pub fn numerics() -> Vec { + vec![ + ConcreteDataType::int8_datatype(), + ConcreteDataType::int16_datatype(), + ConcreteDataType::int32_datatype(), + ConcreteDataType::int64_datatype(), + ConcreteDataType::uint8_datatype(), + ConcreteDataType::uint16_datatype(), + ConcreteDataType::uint32_datatype(), + ConcreteDataType::uint64_datatype(), + ConcreteDataType::float32_datatype(), + ConcreteDataType::float64_datatype(), + ] + } /// Convert arrow data type to [ConcreteDataType]. /// @@ -130,9 +131,9 @@ impl ConcreteDataType { ConcreteDataType::try_from(dt).expect("Unimplemented type") } - // pub fn is_null(&self) -> bool { - // matches!(self, ConcreteDataType::Null(NullType)) - // } + pub fn is_null(&self) -> bool { + matches!(self, ConcreteDataType::Null(NullType)) + } } impl TryFrom<&ArrowDataType> for ConcreteDataType { @@ -261,7 +262,6 @@ pub trait DataType: std::fmt::Debug + Send + Sync { pub type DataTypeRef = Arc; -// TODO(yingwen): Pass all tests. #[cfg(test)] mod tests { use arrow::datatypes::Field; @@ -401,9 +401,86 @@ mod tests { assert!(!ConcreteDataType::uint64_datatype().is_timestamp_compatible()); } - // #[test] - // fn test_is_null() { - // assert!(ConcreteDataType::null_datatype().is_null()); - // assert!(!ConcreteDataType::int32_datatype().is_null()); - // } + #[test] + fn test_is_null() { + assert!(ConcreteDataType::null_datatype().is_null()); + assert!(!ConcreteDataType::int32_datatype().is_null()); + } + + #[test] + fn test_is_float() { + assert!(!ConcreteDataType::int32_datatype().is_float()); + assert!(ConcreteDataType::float32_datatype().is_float()); + assert!(ConcreteDataType::float64_datatype().is_float()); + } + + #[test] + fn test_is_boolean() { + assert!(!ConcreteDataType::int32_datatype().is_boolean()); + assert!(!ConcreteDataType::float32_datatype().is_boolean()); + assert!(ConcreteDataType::boolean_datatype().is_boolean()); + } + + #[test] + fn test_is_stringifiable() { + assert!(!ConcreteDataType::int32_datatype().is_stringifiable()); + assert!(!ConcreteDataType::float32_datatype().is_stringifiable()); + assert!(ConcreteDataType::string_datatype().is_stringifiable()); + assert!(ConcreteDataType::date_datatype().is_stringifiable()); + assert!(ConcreteDataType::datetime_datatype().is_stringifiable()); + assert!(ConcreteDataType::timestamp_second_datatype().is_stringifiable()); + assert!(ConcreteDataType::timestamp_millisecond_datatype().is_stringifiable()); + assert!(ConcreteDataType::timestamp_microsecond_datatype().is_stringifiable()); + assert!(ConcreteDataType::timestamp_nanosecond_datatype().is_stringifiable()); + } + + #[test] + fn test_is_signed() { + assert!(ConcreteDataType::int8_datatype().is_signed()); + assert!(ConcreteDataType::int16_datatype().is_signed()); + assert!(ConcreteDataType::int32_datatype().is_signed()); + assert!(ConcreteDataType::int64_datatype().is_signed()); + assert!(ConcreteDataType::date_datatype().is_signed()); + assert!(ConcreteDataType::datetime_datatype().is_signed()); + assert!(ConcreteDataType::timestamp_second_datatype().is_signed()); + assert!(ConcreteDataType::timestamp_millisecond_datatype().is_signed()); + assert!(ConcreteDataType::timestamp_microsecond_datatype().is_signed()); + assert!(ConcreteDataType::timestamp_nanosecond_datatype().is_signed()); + + assert!(!ConcreteDataType::uint8_datatype().is_signed()); + assert!(!ConcreteDataType::uint16_datatype().is_signed()); + assert!(!ConcreteDataType::uint32_datatype().is_signed()); + assert!(!ConcreteDataType::uint64_datatype().is_signed()); + + assert!(!ConcreteDataType::float32_datatype().is_signed()); + assert!(!ConcreteDataType::float64_datatype().is_signed()); + } + + #[test] + fn test_is_unsigned() { + assert!(!ConcreteDataType::int8_datatype().is_unsigned()); + assert!(!ConcreteDataType::int16_datatype().is_unsigned()); + assert!(!ConcreteDataType::int32_datatype().is_unsigned()); + assert!(!ConcreteDataType::int64_datatype().is_unsigned()); + assert!(!ConcreteDataType::date_datatype().is_unsigned()); + assert!(!ConcreteDataType::datetime_datatype().is_unsigned()); + assert!(!ConcreteDataType::timestamp_second_datatype().is_unsigned()); + assert!(!ConcreteDataType::timestamp_millisecond_datatype().is_unsigned()); + assert!(!ConcreteDataType::timestamp_microsecond_datatype().is_unsigned()); + assert!(!ConcreteDataType::timestamp_nanosecond_datatype().is_unsigned()); + + assert!(ConcreteDataType::uint8_datatype().is_unsigned()); + assert!(ConcreteDataType::uint16_datatype().is_unsigned()); + assert!(ConcreteDataType::uint32_datatype().is_unsigned()); + assert!(ConcreteDataType::uint64_datatype().is_unsigned()); + + assert!(!ConcreteDataType::float32_datatype().is_unsigned()); + assert!(!ConcreteDataType::float64_datatype().is_unsigned()); + } + + #[test] + fn test_numerics() { + let nums = ConcreteDataType::numerics(); + assert_eq!(10, nums.len()); + } } diff --git a/src/datatypes2/src/scalars.rs b/src/datatypes2/src/scalars.rs index 446a432cf09b..327ebaa629a2 100644 --- a/src/datatypes2/src/scalars.rs +++ b/src/datatypes2/src/scalars.rs @@ -298,6 +298,8 @@ impl<'a> ScalarRef<'a> for DateTime { } } +// Timestamp types implement Scalar and ScalarRef in `src/timestamp.rs`. + impl Scalar for ListValue { type VectorType = ListVector; type RefType<'a> = ListValueRef<'a>; diff --git a/src/datatypes2/src/timestamp.rs b/src/datatypes2/src/timestamp.rs index 0498d883f282..f14e91a6c614 100644 --- a/src/datatypes2/src/timestamp.rs +++ b/src/datatypes2/src/timestamp.rs @@ -112,3 +112,24 @@ define_timestamp_with_unit!(Second); define_timestamp_with_unit!(Millisecond); define_timestamp_with_unit!(Microsecond); define_timestamp_with_unit!(Nanosecond); + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_timestamp_scalar() { + let ts = TimestampSecond::new(123); + assert_eq!(ts, ts.as_scalar_ref()); + assert_eq!(ts, ts.to_owned_scalar()); + let ts = TimestampMillisecond::new(123); + assert_eq!(ts, ts.as_scalar_ref()); + assert_eq!(ts, ts.to_owned_scalar()); + let ts = TimestampMicrosecond::new(123); + assert_eq!(ts, ts.as_scalar_ref()); + assert_eq!(ts, ts.to_owned_scalar()); + let ts = TimestampNanosecond::new(123); + assert_eq!(ts, ts.as_scalar_ref()); + assert_eq!(ts, ts.to_owned_scalar()); + } +} diff --git a/src/datatypes2/src/type_id.rs b/src/datatypes2/src/type_id.rs index 4fdda225fcf1..bcb7ea52b129 100644 --- a/src/datatypes2/src/type_id.rs +++ b/src/datatypes2/src/type_id.rs @@ -58,30 +58,36 @@ impl LogicalTypeId { /// Panics if data type is not supported. #[cfg(any(test, feature = "test"))] pub fn data_type(&self) -> crate::data_type::ConcreteDataType { - unimplemented!() - // use crate::data_type::ConcreteDataType; + use crate::data_type::ConcreteDataType; - // match self { - // LogicalTypeId::Null => ConcreteDataType::null_datatype(), - // LogicalTypeId::Boolean => ConcreteDataType::boolean_datatype(), - // LogicalTypeId::Int8 => ConcreteDataType::int8_datatype(), - // LogicalTypeId::Int16 => ConcreteDataType::int16_datatype(), - // LogicalTypeId::Int32 => ConcreteDataType::int32_datatype(), - // LogicalTypeId::Int64 => ConcreteDataType::int64_datatype(), - // LogicalTypeId::UInt8 => ConcreteDataType::uint8_datatype(), - // LogicalTypeId::UInt16 => ConcreteDataType::uint16_datatype(), - // LogicalTypeId::UInt32 => ConcreteDataType::uint32_datatype(), - // LogicalTypeId::UInt64 => ConcreteDataType::uint64_datatype(), - // LogicalTypeId::Float32 => ConcreteDataType::float32_datatype(), - // LogicalTypeId::Float64 => ConcreteDataType::float64_datatype(), - // LogicalTypeId::String => ConcreteDataType::string_datatype(), - // LogicalTypeId::Binary => ConcreteDataType::binary_datatype(), - // LogicalTypeId::Date => ConcreteDataType::date_datatype(), - // LogicalTypeId::DateTime => ConcreteDataType::datetime_datatype(), - // LogicalTypeId::Timestamp => ConcreteDataType::timestamp_millis_datatype(), // to timestamp type with default time unit - // LogicalTypeId::List => { - // ConcreteDataType::list_datatype(ConcreteDataType::null_datatype()) - // } - // } + match self { + LogicalTypeId::Null => ConcreteDataType::null_datatype(), + LogicalTypeId::Boolean => ConcreteDataType::boolean_datatype(), + LogicalTypeId::Int8 => ConcreteDataType::int8_datatype(), + LogicalTypeId::Int16 => ConcreteDataType::int16_datatype(), + LogicalTypeId::Int32 => ConcreteDataType::int32_datatype(), + LogicalTypeId::Int64 => ConcreteDataType::int64_datatype(), + LogicalTypeId::UInt8 => ConcreteDataType::uint8_datatype(), + LogicalTypeId::UInt16 => ConcreteDataType::uint16_datatype(), + LogicalTypeId::UInt32 => ConcreteDataType::uint32_datatype(), + LogicalTypeId::UInt64 => ConcreteDataType::uint64_datatype(), + LogicalTypeId::Float32 => ConcreteDataType::float32_datatype(), + LogicalTypeId::Float64 => ConcreteDataType::float64_datatype(), + LogicalTypeId::String => ConcreteDataType::string_datatype(), + LogicalTypeId::Binary => ConcreteDataType::binary_datatype(), + LogicalTypeId::Date => ConcreteDataType::date_datatype(), + LogicalTypeId::DateTime => ConcreteDataType::datetime_datatype(), + LogicalTypeId::TimestampSecond => ConcreteDataType::timestamp_second_datatype(), + LogicalTypeId::TimestampMillisecond => { + ConcreteDataType::timestamp_millisecond_datatype() + } + LogicalTypeId::TimestampMicrosecond => { + ConcreteDataType::timestamp_microsecond_datatype() + } + LogicalTypeId::TimestampNanosecond => ConcreteDataType::timestamp_nanosecond_datatype(), + LogicalTypeId::List => { + ConcreteDataType::list_datatype(ConcreteDataType::null_datatype()) + } + } } } diff --git a/src/datatypes2/src/value.rs b/src/datatypes2/src/value.rs index ef89ff395637..bade88d419af 100644 --- a/src/datatypes2/src/value.rs +++ b/src/datatypes2/src/value.rs @@ -393,9 +393,9 @@ impl Ord for ListValue { impl TryFrom for Value { type Error = error::Error; - // TODO(yingwen): Implement it. fn try_from(v: ScalarValue) -> Result { let v = match v { + ScalarValue::Null => Value::Null, ScalarValue::Boolean(b) => Value::from(b), ScalarValue::Float32(f) => Value::from(f), ScalarValue::Float64(f) => Value::from(f), @@ -410,7 +410,9 @@ impl TryFrom for Value { ScalarValue::Utf8(s) | ScalarValue::LargeUtf8(s) => { Value::from(s.map(StringBytes::from)) } - ScalarValue::Binary(b) | ScalarValue::LargeBinary(b) => Value::from(b.map(Bytes::from)), + ScalarValue::Binary(b) + | ScalarValue::LargeBinary(b) + | ScalarValue::FixedSizeBinary(_, b) => Value::from(b.map(Bytes::from)), ScalarValue::List(vs, field) => { let items = if let Some(vs) = vs { let vs = vs @@ -440,7 +442,13 @@ impl TryFrom for Value { ScalarValue::TimestampNanosecond(t, _) => t .map(|x| Value::Timestamp(Timestamp::new(x, TimeUnit::Nanosecond))) .unwrap_or(Value::Null), - _ => { + ScalarValue::Decimal128(_, _, _) + | ScalarValue::Time64(_) + | ScalarValue::IntervalYearMonth(_) + | ScalarValue::IntervalDayTime(_) + | ScalarValue::IntervalMonthDayNano(_) + | ScalarValue::Struct(_, _) + | ScalarValue::Dictionary(_, _) => { return error::UnsupportedArrowTypeSnafu { arrow_type: v.get_datatype(), } @@ -648,7 +656,6 @@ impl<'a> PartialOrd for ListValueRef<'a> { } } -// TODO(yingwen): Pass all tests. #[cfg(test)] mod tests { use arrow::datatypes::DataType as ArrowDataType; diff --git a/src/datatypes2/src/vectors.rs b/src/datatypes2/src/vectors.rs index b7dd1f2e2ede..38fa762d4b3c 100644 --- a/src/datatypes2/src/vectors.rs +++ b/src/datatypes2/src/vectors.rs @@ -40,16 +40,27 @@ mod string; mod timestamp; mod validity; -pub use binary::*; -pub use boolean::*; -pub use date::*; -pub use datetime::*; +pub use binary::{BinaryVector, BinaryVectorBuilder}; +pub use boolean::{BooleanVector, BooleanVectorBuilder}; +pub use constant::ConstantVector; +pub use date::{DateVector, DateVectorBuilder}; +pub use datetime::{DateTimeVector, DateTimeVectorBuilder}; pub use helper::Helper; -pub use list::*; -pub use null::*; -pub use primitive::*; -pub use string::*; -pub use timestamp::*; +pub use list::{ListIter, ListVector, ListVectorBuilder}; +pub use null::{NullVector, NullVectorBuilder}; +pub use primitive::{ + Float32Vector, Float32VectorBuilder, Float64Vector, Float64VectorBuilder, Int16Vector, + Int16VectorBuilder, Int32Vector, Int32VectorBuilder, Int64Vector, Int64VectorBuilder, + Int8Vector, Int8VectorBuilder, PrimitiveIter, PrimitiveVector, PrimitiveVectorBuilder, + UInt16Vector, UInt16VectorBuilder, UInt32Vector, UInt32VectorBuilder, UInt64Vector, + UInt64VectorBuilder, UInt8Vector, UInt8VectorBuilder, +}; +pub use string::{StringVector, StringVectorBuilder}; +pub use timestamp::{ + TimestampMicrosecondVector, TimestampMicrosecondVectorBuilder, TimestampMillisecondVector, + TimestampMillisecondVectorBuilder, TimestampNanosecondVector, TimestampNanosecondVectorBuilder, + TimestampSecondVector, TimestampSecondVectorBuilder, +}; pub use validity::Validity; // TODO(yingwen): arrow 28.0 implements Clone for all arrays, we could upgrade to it and simplify diff --git a/src/datatypes2/src/vectors/eq.rs b/src/datatypes2/src/vectors/eq.rs index a93b6c27055b..55359026d479 100644 --- a/src/datatypes2/src/vectors/eq.rs +++ b/src/datatypes2/src/vectors/eq.rs @@ -47,7 +47,6 @@ macro_rules! is_vector_eq { }}; } -// TODO(yingwen): Impl eq for other vectors. fn equal(lhs: &dyn Vector, rhs: &dyn Vector) -> bool { if lhs.data_type() != rhs.data_type() || lhs.len() != rhs.len() { return false; @@ -114,7 +113,7 @@ fn equal(lhs: &dyn Vector, rhs: &dyn Vector) -> bool { mod tests { use super::*; use crate::vectors::{ - Float32Vector, Float64Vector, Int16Vector, Int32Vector, Int64Vector, Int8Vector, + list, Float32Vector, Float64Vector, Int16Vector, Int32Vector, Int64Vector, Int8Vector, NullVector, UInt16Vector, UInt32Vector, UInt64Vector, UInt8Vector, VectorRef, }; @@ -150,24 +149,23 @@ mod tests { assert_vector_ref_eq(Arc::new(TimestampMillisecondVector::from_values([ 100, 120, ]))); - assert_vector_ref_eq(Arc::new(TimestampMicrosecondVector::from_values([ 100, 120, ]))); assert_vector_ref_eq(Arc::new(TimestampNanosecondVector::from_values([100, 120]))); - // let mut arrow_array = MutableListArray::>::new(); - // arrow_array - // .try_extend(vec![Some(vec![Some(1), Some(2), Some(3)])]) - // .unwrap(); - // let arrow_array: ListArray = arrow_array.into(); - // assert_vector_ref_eq(Arc::new(ListVector::from(arrow_array))); - - // assert_vector_ref_eq(Arc::new(NullVector::new(4))); - // assert_vector_ref_eq(Arc::new(StringVector::from(vec![ - // Some("hello"), - // Some("world"), - // ]))); + let list_vector = list::tests::new_list_vector(&[ + Some(vec![Some(1), Some(2)]), + None, + Some(vec![Some(3), Some(4)]), + ]); + assert_vector_ref_eq(Arc::new(list_vector)); + + assert_vector_ref_eq(Arc::new(NullVector::new(4))); + assert_vector_ref_eq(Arc::new(StringVector::from(vec![ + Some("hello"), + Some("world"), + ]))); assert_vector_ref_eq(Arc::new(Int8Vector::from_slice(&[1, 2, 3, 4]))); assert_vector_ref_eq(Arc::new(UInt8Vector::from_slice(&[1, 2, 3, 4]))); diff --git a/src/datatypes2/src/vectors/helper.rs b/src/datatypes2/src/vectors/helper.rs index 572871f91ad3..f3236ca0ec42 100644 --- a/src/datatypes2/src/vectors/helper.rs +++ b/src/datatypes2/src/vectors/helper.rs @@ -17,20 +17,26 @@ use std::any::Any; use std::sync::Arc; -use arrow::array::{Array, ArrayRef}; +use arrow::array::{Array, ArrayRef, StringArray}; +use arrow::compute; +use arrow::compute::kernels::comparison; use arrow::datatypes::{DataType as ArrowDataType, TimeUnit}; -use snafu::OptionExt; +use datafusion_common::ScalarValue; +use snafu::{OptionExt, ResultExt}; +use crate::data_type::ConcreteDataType; use crate::error::{self, Result}; -use crate::scalars::Scalar; +use crate::scalars::{Scalar, ScalarVectorBuilder}; +use crate::value::{ListValue, ListValueRef}; use crate::vectors::{ - BinaryVector, BooleanVector, DateTimeVector, DateVector, Float32Vector, Float64Vector, - Int16Vector, Int32Vector, Int64Vector, Int8Vector, ListVector, MutableVector, NullVector, - StringVector, TimestampMicrosecondVector, TimestampMillisecondVector, - TimestampNanosecondVector, TimestampSecondVector, UInt16Vector, UInt32Vector, UInt64Vector, - UInt8Vector, Vector, VectorRef, + BinaryVector, BooleanVector, ConstantVector, DateTimeVector, DateVector, Float32Vector, + Float64Vector, Int16Vector, Int32Vector, Int64Vector, Int8Vector, ListVector, + ListVectorBuilder, MutableVector, NullVector, StringVector, TimestampMicrosecondVector, + TimestampMillisecondVector, TimestampNanosecondVector, TimestampSecondVector, UInt16Vector, + UInt32Vector, UInt64Vector, UInt8Vector, Vector, VectorRef, }; +/// Helper functions for `Vector`. pub struct Helper; impl Helper { @@ -105,73 +111,105 @@ impl Helper { arr } - // /// Try to cast an arrow scalar value into vector - // /// - // /// # Panics - // /// Panic if given scalar value is not supported. - // pub fn try_from_scalar_value(value: ScalarValue, length: usize) -> Result { - // let vector = match value { - // ScalarValue::Boolean(v) => { - // ConstantVector::new(Arc::new(BooleanVector::from(vec![v])), length) - // } - // ScalarValue::Float32(v) => { - // ConstantVector::new(Arc::new(Float32Vector::from(vec![v])), length) - // } - // ScalarValue::Float64(v) => { - // ConstantVector::new(Arc::new(Float64Vector::from(vec![v])), length) - // } - // ScalarValue::Int8(v) => { - // ConstantVector::new(Arc::new(Int8Vector::from(vec![v])), length) - // } - // ScalarValue::Int16(v) => { - // ConstantVector::new(Arc::new(Int16Vector::from(vec![v])), length) - // } - // ScalarValue::Int32(v) => { - // ConstantVector::new(Arc::new(Int32Vector::from(vec![v])), length) - // } - // ScalarValue::Int64(v) => { - // ConstantVector::new(Arc::new(Int64Vector::from(vec![v])), length) - // } - // ScalarValue::UInt8(v) => { - // ConstantVector::new(Arc::new(UInt8Vector::from(vec![v])), length) - // } - // ScalarValue::UInt16(v) => { - // ConstantVector::new(Arc::new(UInt16Vector::from(vec![v])), length) - // } - // ScalarValue::UInt32(v) => { - // ConstantVector::new(Arc::new(UInt32Vector::from(vec![v])), length) - // } - // ScalarValue::UInt64(v) => { - // ConstantVector::new(Arc::new(UInt64Vector::from(vec![v])), length) - // } - // ScalarValue::Utf8(v) => { - // ConstantVector::new(Arc::new(StringVector::from(vec![v])), length) - // } - // ScalarValue::LargeUtf8(v) => { - // ConstantVector::new(Arc::new(StringVector::from(vec![v])), length) - // } - // ScalarValue::Binary(v) => { - // ConstantVector::new(Arc::new(BinaryVector::from(vec![v])), length) - // } - // ScalarValue::LargeBinary(v) => { - // ConstantVector::new(Arc::new(BinaryVector::from(vec![v])), length) - // } - // ScalarValue::Date32(v) => { - // ConstantVector::new(Arc::new(DateVector::from(vec![v])), length) - // } - // ScalarValue::Date64(v) => { - // ConstantVector::new(Arc::new(DateTimeVector::from(vec![v])), length) - // } - // _ => { - // return error::ConversionSnafu { - // from: format!("Unsupported scalar value: {}", value), - // } - // .fail() - // } - // }; - - // Ok(Arc::new(vector)) - // } + /// Try to cast an arrow scalar value into vector + pub fn try_from_scalar_value(value: ScalarValue, length: usize) -> Result { + let vector = match value { + ScalarValue::Null => ConstantVector::new(Arc::new(NullVector::new(1)), length), + ScalarValue::Boolean(v) => { + ConstantVector::new(Arc::new(BooleanVector::from(vec![v])), length) + } + ScalarValue::Float32(v) => { + ConstantVector::new(Arc::new(Float32Vector::from(vec![v])), length) + } + ScalarValue::Float64(v) => { + ConstantVector::new(Arc::new(Float64Vector::from(vec![v])), length) + } + ScalarValue::Int8(v) => { + ConstantVector::new(Arc::new(Int8Vector::from(vec![v])), length) + } + ScalarValue::Int16(v) => { + ConstantVector::new(Arc::new(Int16Vector::from(vec![v])), length) + } + ScalarValue::Int32(v) => { + ConstantVector::new(Arc::new(Int32Vector::from(vec![v])), length) + } + ScalarValue::Int64(v) => { + ConstantVector::new(Arc::new(Int64Vector::from(vec![v])), length) + } + ScalarValue::UInt8(v) => { + ConstantVector::new(Arc::new(UInt8Vector::from(vec![v])), length) + } + ScalarValue::UInt16(v) => { + ConstantVector::new(Arc::new(UInt16Vector::from(vec![v])), length) + } + ScalarValue::UInt32(v) => { + ConstantVector::new(Arc::new(UInt32Vector::from(vec![v])), length) + } + ScalarValue::UInt64(v) => { + ConstantVector::new(Arc::new(UInt64Vector::from(vec![v])), length) + } + ScalarValue::Utf8(v) | ScalarValue::LargeUtf8(v) => { + ConstantVector::new(Arc::new(StringVector::from(vec![v])), length) + } + ScalarValue::Binary(v) + | ScalarValue::LargeBinary(v) + | ScalarValue::FixedSizeBinary(_, v) => { + ConstantVector::new(Arc::new(BinaryVector::from(vec![v])), length) + } + ScalarValue::List(v, field) => { + let item_type = ConcreteDataType::try_from(field.data_type())?; + let mut builder = ListVectorBuilder::with_type_capacity(item_type.clone(), 1); + if let Some(values) = v { + let values = values + .into_iter() + .map(ScalarValue::try_into) + .collect::>()?; + let list_value = ListValue::new(Some(Box::new(values)), item_type); + builder.push(Some(ListValueRef::Ref { val: &list_value })); + } else { + builder.push(None); + } + let list_vector = builder.to_vector(); + ConstantVector::new(list_vector, length) + } + ScalarValue::Date32(v) => { + ConstantVector::new(Arc::new(DateVector::from(vec![v])), length) + } + ScalarValue::Date64(v) => { + ConstantVector::new(Arc::new(DateTimeVector::from(vec![v])), length) + } + ScalarValue::TimestampSecond(v, _) => { + // Timezone is unimplemented now. + ConstantVector::new(Arc::new(TimestampSecondVector::from(vec![v])), length) + } + ScalarValue::TimestampMillisecond(v, _) => { + // Timezone is unimplemented now. + ConstantVector::new(Arc::new(TimestampMillisecondVector::from(vec![v])), length) + } + ScalarValue::TimestampMicrosecond(v, _) => { + // Timezone is unimplemented now. + ConstantVector::new(Arc::new(TimestampMicrosecondVector::from(vec![v])), length) + } + ScalarValue::TimestampNanosecond(v, _) => { + // Timezone is unimplemented now. + ConstantVector::new(Arc::new(TimestampNanosecondVector::from(vec![v])), length) + } + ScalarValue::Decimal128(_, _, _) + | ScalarValue::Time64(_) + | ScalarValue::IntervalYearMonth(_) + | ScalarValue::IntervalDayTime(_) + | ScalarValue::IntervalMonthDayNano(_) + | ScalarValue::Struct(_, _) + | ScalarValue::Dictionary(_, _) => { + return error::ConversionSnafu { + from: format!("Unsupported scalar value: {}", value), + } + .fail() + } + }; + + Ok(Arc::new(vector)) + } /// Try to cast an arrow array into vector /// @@ -181,9 +219,7 @@ impl Helper { Ok(match array.as_ref().data_type() { ArrowDataType::Null => Arc::new(NullVector::try_from_arrow_array(array)?), ArrowDataType::Boolean => Arc::new(BooleanVector::try_from_arrow_array(array)?), - ArrowDataType::Binary | ArrowDataType::LargeBinary => { - Arc::new(BinaryVector::try_from_arrow_array(array)?) - } + ArrowDataType::LargeBinary => Arc::new(BinaryVector::try_from_arrow_array(array)?), ArrowDataType::Int8 => Arc::new(Int8Vector::try_from_arrow_array(array)?), ArrowDataType::Int16 => Arc::new(Int16Vector::try_from_arrow_array(array)?), ArrowDataType::Int32 => Arc::new(Int32Vector::try_from_arrow_array(array)?), @@ -194,9 +230,7 @@ impl Helper { ArrowDataType::UInt64 => Arc::new(UInt64Vector::try_from_arrow_array(array)?), ArrowDataType::Float32 => Arc::new(Float32Vector::try_from_arrow_array(array)?), ArrowDataType::Float64 => Arc::new(Float64Vector::try_from_arrow_array(array)?), - ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 => { - Arc::new(StringVector::try_from_arrow_array(array)?) - } + ArrowDataType::Utf8 => Arc::new(StringVector::try_from_arrow_array(array)?), ArrowDataType::Date32 => Arc::new(DateVector::try_from_arrow_array(array)?), ArrowDataType::Date64 => Arc::new(DateTimeVector::try_from_arrow_array(array)?), ArrowDataType::List(_) => Arc::new(ListVector::try_from_arrow_array(array)?), @@ -212,31 +246,57 @@ impl Helper { Arc::new(TimestampNanosecondVector::try_from_arrow_array(array)?) } }, - _ => unimplemented!("Arrow array datatype: {:?}", array.as_ref().data_type()), + ArrowDataType::Float16 + | ArrowDataType::Time32(_) + | ArrowDataType::Time64(_) + | ArrowDataType::Duration(_) + | ArrowDataType::Interval(_) + | ArrowDataType::Binary + | ArrowDataType::FixedSizeBinary(_) + | ArrowDataType::LargeUtf8 + | ArrowDataType::LargeList(_) + | ArrowDataType::FixedSizeList(_, _) + | ArrowDataType::Struct(_) + | ArrowDataType::Union(_, _, _) + | ArrowDataType::Dictionary(_, _) + | ArrowDataType::Decimal128(_, _) + | ArrowDataType::Decimal256(_, _) + | ArrowDataType::Map(_, _) => { + unimplemented!("Arrow array datatype: {:?}", array.as_ref().data_type()) + } }) } + /// Try to cast slice of `arrays` to vectors. pub fn try_into_vectors(arrays: &[ArrayRef]) -> Result> { arrays.iter().map(Self::try_into_vector).collect() } - // pub fn like_utf8(names: Vec, s: &str) -> Result { - // let array = StringArray::from_slice(&names); + /// Perform SQL like operation on `names` and a scalar `s`. + pub fn like_utf8(names: Vec, s: &str) -> Result { + let array = StringArray::from(names); - // let filter = - // compute::like::like_utf8_scalar(&array, s).context(error::ArrowComputeSnafu)?; + let filter = comparison::like_utf8_scalar(&array, s).context(error::ArrowComputeSnafu)?; - // let result = compute::filter::filter(&array, &filter).context(error::ArrowComputeSnafu)?; - // Helper::try_into_vector(result) - // } + let result = compute::filter(&array, &filter).context(error::ArrowComputeSnafu)?; + Helper::try_into_vector(result) + } } #[cfg(test)] mod tests { - use arrow::array::Int32Array; + use arrow::array::{ + ArrayRef, BooleanArray, Date32Array, Date64Array, Float32Array, Float64Array, Int16Array, + Int32Array, Int64Array, Int8Array, LargeBinaryArray, ListArray, NullArray, + TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, + TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, + }; + use arrow::datatypes::{Field, Int32Type}; + use common_time::{Date, DateTime}; use super::*; use crate::value::Value; + use crate::vectors::ConcreteDataType; #[test] fn test_try_into_vectors() { @@ -258,7 +318,7 @@ mod tests { fn test_try_into_date_vector() { let vector = DateVector::from(vec![Some(1), Some(2), None]); let arrow_array = vector.to_arrow_array(); - assert_eq!(&arrow::datatypes::DataType::Date32, arrow_array.data_type()); + assert_eq!(&ArrowDataType::Date32, arrow_array.data_type()); let vector_converted = Helper::try_into_vector(arrow_array).unwrap(); assert_eq!(vector.len(), vector_converted.len()); for i in 0..vector_converted.len() { @@ -266,48 +326,106 @@ mod tests { } } - // #[test] - // fn test_try_from_scalar_date_value() { - // let vector = Helper::try_from_scalar_value(ScalarValue::Date32(Some(42)), 3).unwrap(); - // assert_eq!(ConcreteDataType::date_datatype(), vector.data_type()); - // assert_eq!(3, vector.len()); - // for i in 0..vector.len() { - // assert_eq!(Value::Date(Date::new(42)), vector.get(i)); - // } - // } - - // #[test] - // fn test_try_from_scalar_datetime_value() { - // let vector = Helper::try_from_scalar_value(ScalarValue::Date64(Some(42)), 3).unwrap(); - // assert_eq!(ConcreteDataType::datetime_datatype(), vector.data_type()); - // assert_eq!(3, vector.len()); - // for i in 0..vector.len() { - // assert_eq!(Value::DateTime(DateTime::new(42)), vector.get(i)); - // } - // } - - // #[test] - // fn test_like_utf8() { - // fn assert_vector(expected: Vec<&str>, actual: &VectorRef) { - // let actual = actual.as_any().downcast_ref::().unwrap(); - // assert_eq!(*actual, StringVector::from(expected)); - // } - - // let names: Vec = vec!["greptime", "hello", "public", "world"] - // .into_iter() - // .map(|x| x.to_string()) - // .collect(); - - // let ret = Helper::like_utf8(names.clone(), "%ll%").unwrap(); - // assert_vector(vec!["hello"], &ret); - - // let ret = Helper::like_utf8(names.clone(), "%time").unwrap(); - // assert_vector(vec!["greptime"], &ret); - - // let ret = Helper::like_utf8(names.clone(), "%ld").unwrap(); - // assert_vector(vec!["world"], &ret); - - // let ret = Helper::like_utf8(names, "%").unwrap(); - // assert_vector(vec!["greptime", "hello", "public", "world"], &ret); - // } + #[test] + fn test_try_from_scalar_date_value() { + let vector = Helper::try_from_scalar_value(ScalarValue::Date32(Some(42)), 3).unwrap(); + assert_eq!(ConcreteDataType::date_datatype(), vector.data_type()); + assert_eq!(3, vector.len()); + for i in 0..vector.len() { + assert_eq!(Value::Date(Date::new(42)), vector.get(i)); + } + } + + #[test] + fn test_try_from_scalar_datetime_value() { + let vector = Helper::try_from_scalar_value(ScalarValue::Date64(Some(42)), 3).unwrap(); + assert_eq!(ConcreteDataType::datetime_datatype(), vector.data_type()); + assert_eq!(3, vector.len()); + for i in 0..vector.len() { + assert_eq!(Value::DateTime(DateTime::new(42)), vector.get(i)); + } + } + + #[test] + fn test_try_from_list_value() { + let value = ScalarValue::List( + Some(vec![ + ScalarValue::Int32(Some(1)), + ScalarValue::Int32(Some(2)), + ]), + Box::new(Field::new("item", ArrowDataType::Int32, true)), + ); + let vector = Helper::try_from_scalar_value(value, 3).unwrap(); + assert_eq!( + ConcreteDataType::list_datatype(ConcreteDataType::int32_datatype()), + vector.data_type() + ); + assert_eq!(3, vector.len()); + for i in 0..vector.len() { + let v = vector.get(i); + let items = v.as_list().unwrap().unwrap().items().as_ref().unwrap(); + assert_eq!(vec![Value::Int32(1), Value::Int32(2)], **items); + } + } + + #[test] + fn test_like_utf8() { + fn assert_vector(expected: Vec<&str>, actual: &VectorRef) { + let actual = actual.as_any().downcast_ref::().unwrap(); + assert_eq!(*actual, StringVector::from(expected)); + } + + let names: Vec = vec!["greptime", "hello", "public", "world"] + .into_iter() + .map(|x| x.to_string()) + .collect(); + + let ret = Helper::like_utf8(names.clone(), "%ll%").unwrap(); + assert_vector(vec!["hello"], &ret); + + let ret = Helper::like_utf8(names.clone(), "%time").unwrap(); + assert_vector(vec!["greptime"], &ret); + + let ret = Helper::like_utf8(names.clone(), "%ld").unwrap(); + assert_vector(vec!["world"], &ret); + + let ret = Helper::like_utf8(names, "%").unwrap(); + assert_vector(vec!["greptime", "hello", "public", "world"], &ret); + } + + fn check_try_into_vector(array: impl Array + 'static) { + let array: ArrayRef = Arc::new(array); + let vector = Helper::try_into_vector(array.clone()).unwrap(); + assert_eq!(&array, &vector.to_arrow_array()); + } + + #[test] + fn test_try_into_vector() { + check_try_into_vector(NullArray::new(2)); + check_try_into_vector(BooleanArray::from(vec![true, false])); + check_try_into_vector(LargeBinaryArray::from(vec![ + "hello".as_bytes(), + "world".as_bytes(), + ])); + check_try_into_vector(Int8Array::from(vec![1, 2, 3])); + check_try_into_vector(Int16Array::from(vec![1, 2, 3])); + check_try_into_vector(Int32Array::from(vec![1, 2, 3])); + check_try_into_vector(Int64Array::from(vec![1, 2, 3])); + check_try_into_vector(UInt8Array::from(vec![1, 2, 3])); + check_try_into_vector(UInt16Array::from(vec![1, 2, 3])); + check_try_into_vector(UInt32Array::from(vec![1, 2, 3])); + check_try_into_vector(UInt64Array::from(vec![1, 2, 3])); + check_try_into_vector(Float32Array::from(vec![1.0, 2.0, 3.0])); + check_try_into_vector(Float64Array::from(vec![1.0, 2.0, 3.0])); + check_try_into_vector(StringArray::from(vec!["hello", "world"])); + check_try_into_vector(Date32Array::from(vec![1, 2, 3])); + check_try_into_vector(Date64Array::from(vec![1, 2, 3])); + let data = vec![None, Some(vec![Some(6), Some(7)])]; + let list_array = ListArray::from_iter_primitive::(data); + check_try_into_vector(list_array); + check_try_into_vector(TimestampSecondArray::from(vec![1, 2, 3])); + check_try_into_vector(TimestampMillisecondArray::from(vec![1, 2, 3])); + check_try_into_vector(TimestampMicrosecondArray::from(vec![1, 2, 3])); + check_try_into_vector(TimestampNanosecondArray::from(vec![1, 2, 3])); + } } diff --git a/src/datatypes2/src/vectors/list.rs b/src/datatypes2/src/vectors/list.rs index 37d718d990bd..747e03557ba2 100644 --- a/src/datatypes2/src/vectors/list.rs +++ b/src/datatypes2/src/vectors/list.rs @@ -459,7 +459,7 @@ impl NullBufferBuilder { } #[cfg(test)] -mod tests { +pub mod tests { use arrow::array::{Int32Array, Int32Builder, ListBuilder}; use serde_json::json; @@ -468,7 +468,7 @@ mod tests { use crate::types::ListType; use crate::vectors::Int32Vector; - fn new_list_vector(data: &[Option>>]) -> ListVector { + pub fn new_list_vector(data: &[Option>>]) -> ListVector { let mut builder = ListVectorBuilder::with_type_capacity(ConcreteDataType::int32_datatype(), 8); for vec_opt in data { @@ -503,7 +503,6 @@ mod tests { builder.finish() } - // TODO(yingwen): Pass validity tests. #[test] fn test_list_vector() { let data = vec![ @@ -533,10 +532,12 @@ mod tests { .downcast_ref::() .unwrap() ); - // assert_eq!( - // Validity::Slots(arrow_array.validity().unwrap()), - // list_vector.validity() - // ); + let validity = list_vector.validity(); + assert!(!validity.is_all_null()); + assert!(!validity.is_all_valid()); + assert!(validity.is_set(0)); + assert!(!validity.is_set(1)); + assert!(validity.is_set(2)); assert_eq!(256, list_vector.memory_size()); let slice = list_vector.slice(0, 2).to_arrow_array(); diff --git a/src/datatypes2/src/vectors/operations.rs b/src/datatypes2/src/vectors/operations.rs index 404f620f378f..70ddb4a0317a 100644 --- a/src/datatypes2/src/vectors/operations.rs +++ b/src/datatypes2/src/vectors/operations.rs @@ -60,10 +60,10 @@ pub trait VectorOp { } macro_rules! impl_scalar_vector_op { - ($( { $VectorType: ident, $replicate: ident } ),+) => {$( + ($($VectorType: ident),+) => {$( impl VectorOp for $VectorType { fn replicate(&self, offsets: &[usize]) -> VectorRef { - replicate::$replicate(self, offsets) + replicate::replicate_scalar(self, offsets) } fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>) { @@ -78,15 +78,7 @@ macro_rules! impl_scalar_vector_op { )+}; } -impl_scalar_vector_op!( - { BinaryVector, replicate_scalar }, - { BooleanVector, replicate_scalar }, - { ListVector, replicate_scalar }, - { StringVector, replicate_scalar } - // { DateVector, replicate_date }, - // { DateTimeVector, replicate_datetime }, - // { TimestampVector, replicate_timestamp } -); +impl_scalar_vector_op!(BinaryVector, BooleanVector, ListVector, StringVector); impl VectorOp for PrimitiveVector { fn replicate(&self, offsets: &[usize]) -> VectorRef { diff --git a/src/datatypes2/src/vectors/operations/filter.rs b/src/datatypes2/src/vectors/operations/filter.rs index f0aa1e34486b..8368a6afb4c4 100644 --- a/src/datatypes2/src/vectors/operations/filter.rs +++ b/src/datatypes2/src/vectors/operations/filter.rs @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -// pub(crate) use crate::vectors::constant::filter_constant; - macro_rules! filter_non_constant { ($vector: expr, $VectorType: ty, $filter: ident) => {{ use std::sync::Arc; diff --git a/src/datatypes2/src/vectors/operations/replicate.rs b/src/datatypes2/src/vectors/operations/replicate.rs index bad6d2cc8c10..8216517fc62d 100644 --- a/src/datatypes2/src/vectors/operations/replicate.rs +++ b/src/datatypes2/src/vectors/operations/replicate.rs @@ -13,11 +13,8 @@ // limitations under the License. use crate::prelude::*; -// pub(crate) use crate::vectors::date::replicate_date; -// pub(crate) use crate::vectors::datetime::replicate_datetime; pub(crate) use crate::vectors::null::replicate_null; pub(crate) use crate::vectors::primitive::replicate_primitive; -// pub(crate) use crate::vectors::timestamp::replicate_timestamp; pub(crate) fn replicate_scalar(c: &C, offsets: &[usize]) -> VectorRef { assert_eq!(offsets.len(), c.len()); @@ -48,7 +45,7 @@ mod tests { use super::*; use crate::vectors::constant::ConstantVector; - use crate::vectors::{Int32Vector, StringVector, VectorOp}; + use crate::vectors::{Int32Vector, NullVector, StringVector, VectorOp}; #[test] fn test_replicate_primitive() { @@ -108,19 +105,19 @@ mod tests { assert_eq!(expect, cv); } - // #[test] - // fn test_replicate_null() { - // let v = NullVector::new(0); - // let offsets = []; - // let v = v.replicate(&offsets); - // assert!(v.is_empty()); + #[test] + fn test_replicate_null() { + let v = NullVector::new(0); + let offsets = []; + let v = v.replicate(&offsets); + assert!(v.is_empty()); - // let v = NullVector::new(3); - // let offsets = [1, 3, 5]; + let v = NullVector::new(3); + let offsets = [1, 3, 5]; - // let v = v.replicate(&offsets); - // assert_eq!(5, v.len()); - // } + let v = v.replicate(&offsets); + assert_eq!(5, v.len()); + } macro_rules! impl_replicate_date_like_test { ($VectorType: ident, $ValueType: ident, $method: ident) => {{