From 31939cb5f049c10cbed6b662215a9bdde1b444fe Mon Sep 17 00:00:00 2001 From: Yijun Zhao Date: Wed, 21 Feb 2024 18:35:11 +0800 Subject: [PATCH] add tests for string_view type --- .../arrow/src/arrow/array/binview/ffi.rs | 2 +- .../arrow/src/arrow/array/binview/from.rs | 24 +++ .../arrow/src/arrow/array/binview/mod.rs | 101 +++++++++- .../arrow/src/arrow/array/binview/mutable.rs | 24 ++- .../io/parquet/read/deserialize/utils.rs | 2 +- .../arrow/tests/it/arrow/array/binview/mod.rs | 189 ++++++++++++++++++ .../tests/it/arrow/array/binview/mutable.rs | 50 +++++ .../it/arrow/array/binview/mutable_values.rs | 31 +++ .../it/arrow/array/binview/to_mutable.rs | 46 +++++ src/common/arrow/tests/it/arrow/array/mod.rs | 1 + 10 files changed, 461 insertions(+), 9 deletions(-) create mode 100644 src/common/arrow/src/arrow/array/binview/from.rs create mode 100644 src/common/arrow/tests/it/arrow/array/binview/mod.rs create mode 100644 src/common/arrow/tests/it/arrow/array/binview/mutable.rs create mode 100644 src/common/arrow/tests/it/arrow/array/binview/mutable_values.rs create mode 100644 src/common/arrow/tests/it/arrow/array/binview/to_mutable.rs diff --git a/src/common/arrow/src/arrow/array/binview/ffi.rs b/src/common/arrow/src/arrow/array/binview/ffi.rs index d6e02c11b284..b28b349c5244 100644 --- a/src/common/arrow/src/arrow/array/binview/ffi.rs +++ b/src/common/arrow/src/arrow/array/binview/ffi.rs @@ -79,7 +79,7 @@ impl FromFfi for BinaryViewArray let validity = unsafe { array.validity() }?; let views = unsafe { array.buffer::(1) }?; - // 2 - validity + views + // n_buffers - 2, 2 means validity + views let n_buffers = array.n_buffers(); let mut remaining_buffers = n_buffers - 2; if remaining_buffers <= 1 { diff --git a/src/common/arrow/src/arrow/array/binview/from.rs b/src/common/arrow/src/arrow/array/binview/from.rs new file mode 100644 index 000000000000..7559b19d8f54 --- /dev/null +++ b/src/common/arrow/src/arrow/array/binview/from.rs @@ -0,0 +1,24 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::arrow::array::BinaryViewArrayGeneric; +use crate::arrow::array::MutableBinaryViewArray; +use crate::arrow::array::ViewType; + +impl> FromIterator> for BinaryViewArrayGeneric { + #[inline] + fn from_iter>>(iter: I) -> Self { + MutableBinaryViewArray::::from_iter(iter).into() + } +} diff --git a/src/common/arrow/src/arrow/array/binview/mod.rs b/src/common/arrow/src/arrow/array/binview/mod.rs index 3bf310087b82..4d4637a95a8c 100644 --- a/src/common/arrow/src/arrow/array/binview/mod.rs +++ b/src/common/arrow/src/arrow/array/binview/mod.rs @@ -15,6 +15,7 @@ mod ffi; pub(crate) mod fmt; +mod from; mod iterator; mod mutable; mod view; @@ -23,6 +24,7 @@ mod private { pub trait Sealed: Send + Sync {} impl Sealed for str {} + impl Sealed for [u8] {} } @@ -33,6 +35,7 @@ use std::sync::atomic::AtomicU64; use std::sync::atomic::Ordering; use std::sync::Arc; +use either::Either; pub use iterator::BinaryViewValueIter; pub use mutable::MutableBinaryViewArray; use private::Sealed; @@ -70,7 +73,7 @@ pub trait ViewType: Sealed + 'static + PartialEq + AsRef { #[allow(clippy::wrong_self_convention)] fn into_owned(&self) -> Self::Owned; - fn dtype() -> &'static DataType; + fn data_type() -> &'static DataType; } impl ViewType for str { @@ -92,7 +95,7 @@ impl ViewType for str { self.to_string() } - fn dtype() -> &'static DataType { + fn data_type() -> &'static DataType { &UTF8_VIEW_TYPE } } @@ -116,7 +119,7 @@ impl ViewType for [u8] { self.to_vec() } - fn dtype() -> &'static DataType { + fn data_type() -> &'static DataType { &BIN_VIEW_TYPE } } @@ -157,6 +160,7 @@ impl Clone for BinaryViewArrayGeneric { } unsafe impl Send for BinaryViewArrayGeneric {} + unsafe impl Sync for BinaryViewArrayGeneric {} fn buffers_into_raw(buffers: &[Buffer]) -> Arc<[(*const T, usize)]> { @@ -233,6 +237,11 @@ impl BinaryViewArrayGeneric { buffers: Arc<[Buffer]>, validity: Option, ) -> Result { + if data_type.to_physical_type() != Self::default_data_type().to_physical_type() { + return Err(Error::oos( + "BinaryViewArray can only be initialized with DataType::BinaryView or DataType::Utf8View", + )); + } if T::IS_UTF8 { validate_utf8_view(views.as_ref(), buffers.as_ref())?; } else { @@ -254,6 +263,12 @@ impl BinaryViewArrayGeneric { } } + /// Returns a new [`BinaryViewArrayGeneric`] from a slice of `&T`. + // Note: this can't be `impl From` because Rust does not allow double `AsRef` on it. + pub fn from, P: AsRef<[Option]>>(slice: P) -> Self { + MutableBinaryViewArray::::from(slice).into() + } + /// Creates an empty [`BinaryViewArrayGeneric`], i.e. whose `.len` is zero. #[inline] pub fn new_empty(data_type: DataType) -> Self { @@ -438,6 +453,84 @@ impl BinaryViewArrayGeneric { total_buffer_len: self.total_buffer_len, } } + + #[must_use] + pub fn into_mut(self) -> Either> { + use Either::*; + let is_unique = (Arc::strong_count(&self.buffers) + Arc::weak_count(&self.buffers)) == 1; + + if let Some(bitmap) = self.validity { + match bitmap.into_mut() { + Left(bitmap) => Left(Self::new_unchecked( + self.data_type, + self.views, + self.buffers, + Some(bitmap), + self.total_bytes_len.load(Ordering::Relaxed) as usize, + self.total_buffer_len, + )), + Right(mutable_bitmap) => match (self.views.into_mut(), is_unique) { + (Right(views), true) => Right(MutableBinaryViewArray { + views, + completed_buffers: self.buffers.to_vec(), + in_progress_buffer: vec![], + validity: Some(mutable_bitmap), + phantom: Default::default(), + total_bytes_len: self.total_bytes_len.load(Ordering::Relaxed) as usize, + total_buffer_len: self.total_buffer_len, + }), + (Right(views), false) => Left(Self::new_unchecked( + self.data_type, + views.into(), + self.buffers, + Some(mutable_bitmap.into()), + self.total_bytes_len.load(Ordering::Relaxed) as usize, + self.total_buffer_len, + )), + (Left(views), _) => Left(Self::new_unchecked( + self.data_type, + views, + self.buffers, + Some(mutable_bitmap.into()), + self.total_bytes_len.load(Ordering::Relaxed) as usize, + self.total_buffer_len, + )), + }, + } + } else { + match (self.views.into_mut(), is_unique) { + (Right(views), true) => Right(MutableBinaryViewArray { + views, + completed_buffers: self.buffers.to_vec(), + in_progress_buffer: vec![], + validity: None, + phantom: Default::default(), + total_bytes_len: self.total_bytes_len.load(Ordering::Relaxed) as usize, + total_buffer_len: self.total_buffer_len, + }), + (Right(views), false) => Left(Self::new_unchecked( + self.data_type, + views.into(), + self.buffers, + None, + self.total_bytes_len.load(Ordering::Relaxed) as usize, + self.total_buffer_len, + )), + (Left(views), _) => Left(Self::new_unchecked( + self.data_type, + views, + self.buffers, + None, + self.total_bytes_len.load(Ordering::Relaxed) as usize, + self.total_buffer_len, + )), + } + } + } + + pub fn default_data_type() -> &'static DataType { + T::data_type() + } } pub type BinaryViewArray = BinaryViewArrayGeneric<[u8]>; @@ -500,7 +593,7 @@ impl Array for BinaryViewArrayGeneric { } fn data_type(&self) -> &DataType { - T::dtype() + T::data_type() } fn validity(&self) -> Option<&Bitmap> { diff --git a/src/common/arrow/src/arrow/array/binview/mutable.rs b/src/common/arrow/src/arrow/array/binview/mutable.rs index c73f2b520e31..da9f2e1cf12d 100644 --- a/src/common/arrow/src/arrow/array/binview/mutable.rs +++ b/src/common/arrow/src/arrow/array/binview/mutable.rs @@ -113,7 +113,11 @@ impl MutableBinaryViewArray { &self.views } - pub fn validity(&mut self) -> Option<&mut MutableBitmap> { + pub fn validity(&self) -> Option<&MutableBitmap> { + self.validity.as_ref() + } + + pub fn validity_mut(&mut self) -> Option<&mut MutableBitmap> { self.validity.as_mut() } @@ -175,8 +179,16 @@ impl MutableBinaryViewArray { payload[0..4].copy_from_slice(&len.to_le_bytes()); if len <= 12 { + // | len | prefix | remaining(zero-padded) | + // ^ ^ ^ + // | 4 bytes | 4 bytes | 8 bytes | payload[4..4 + bytes.len()].copy_from_slice(bytes); } else { + // | len | prefix | buffer | offsets | + // ^ ^ ^ ^ + // | 4 bytes | 4 bytes | 4 bytes | 4 bytes | + // + // buffer index + offset -> real binary data self.total_buffer_len += bytes.len(); let required_cap = self.in_progress_buffer.len() + bytes.len(); if self.in_progress_buffer.capacity() < required_cap { @@ -192,6 +204,7 @@ impl MutableBinaryViewArray { let offset = self.in_progress_buffer.len() as u32; self.in_progress_buffer.extend_from_slice(bytes); + // set prefix unsafe { payload[4..8].copy_from_slice(bytes.get_unchecked(0..4)) }; let buffer_idx: u32 = self.completed_buffers.len().try_into().unwrap(); payload[8..12].copy_from_slice(&buffer_idx.to_le_bytes()); @@ -347,12 +360,13 @@ impl MutableBinaryViewArray { let len = v.length; // view layout: + // for no-inlined layout: // length: 4 bytes // prefix: 4 bytes // buffer_index: 4 bytes // offset: 4 bytes - // inlined layout: + // for inlined layout: // length: 4 bytes // data: 12 bytes let bytes = if len <= 12 { @@ -378,6 +392,10 @@ impl MutableBinaryViewArray { pub fn values_iter(&self) -> MutableBinaryViewValueIter { MutableBinaryViewValueIter::new(self) } + + pub fn values(&self) -> Vec<&T> { + self.values_iter().collect() + } } impl MutableBinaryViewArray<[u8]> { @@ -404,7 +422,7 @@ impl> FromIterator> for MutableBinar impl MutableArray for MutableBinaryViewArray { fn data_type(&self) -> &DataType { - T::dtype() + T::data_type() } fn len(&self) -> usize { diff --git a/src/common/arrow/src/arrow/io/parquet/read/deserialize/utils.rs b/src/common/arrow/src/arrow/io/parquet/read/deserialize/utils.rs index 1e1b6106bf63..14ca2ca7bb74 100644 --- a/src/common/arrow/src/arrow/io/parquet/read/deserialize/utils.rs +++ b/src/common/arrow/src/arrow/io/parquet/read/deserialize/utils.rs @@ -116,7 +116,7 @@ impl Pushable<&T> for MutableBinaryViewArray { views.push(view); } - if let Some(bitmap) = self.validity() { + if let Some(bitmap) = self.validity_mut() { bitmap.extend_constant(remaining, true) } } diff --git a/src/common/arrow/tests/it/arrow/array/binview/mod.rs b/src/common/arrow/tests/it/arrow/array/binview/mod.rs new file mode 100644 index 000000000000..32f10baecf39 --- /dev/null +++ b/src/common/arrow/tests/it/arrow/array/binview/mod.rs @@ -0,0 +1,189 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +mod mutable; +mod mutable_values; +mod to_mutable; + +use std::sync::Arc; + +use databend_common_arrow::arrow::array::Array; +use databend_common_arrow::arrow::array::BinaryViewArray; +use databend_common_arrow::arrow::array::Utf8ViewArray; +use databend_common_arrow::arrow::bitmap::Bitmap; +use databend_common_arrow::arrow::buffer::Buffer; +use databend_common_arrow::arrow::datatypes::DataType; + +#[test] +fn basics_string_view() { + let data = vec![ + Some("hello"), + None, + // larger than 12 bytes. + Some("Databend Cloud is a Cost-Effective alternative to Snowflake."), + ]; + + let array: Utf8ViewArray = data.into_iter().collect(); + + assert_eq!(array.value(0), "hello"); + assert_eq!(array.value(1), ""); + assert_eq!( + array.value(2), + "Databend Cloud is a Cost-Effective alternative to Snowflake." + ); + assert_eq!( + unsafe { array.value_unchecked(2) }, + "Databend Cloud is a Cost-Effective alternative to Snowflake." + ); + assert_eq!( + array.validity(), + Some(&Bitmap::from_u8_slice([0b00000101], 3)) + ); + assert!(array.is_valid(0)); + assert!(!array.is_valid(1)); + assert!(array.is_valid(2)); + + let array2 = Utf8ViewArray::new_unchecked( + DataType::Utf8View, + array.views().clone(), + array.data_buffers().clone(), + array.validity().cloned(), + array.total_bytes_len(), + array.total_buffer_len(), + ); + + assert_eq!(array, array2); + + let array = array.sliced(1, 2); + + assert_eq!(array.value(0), ""); + assert_eq!( + array.value(1), + "Databend Cloud is a Cost-Effective alternative to Snowflake." + ); +} + +#[test] +fn basics_binary_view() { + let data = vec![ + Some(b"hello".to_vec()), + None, + // larger than 12 bytes. + Some(b"Databend Cloud is a Cost-Effective alternative to Snowflake.".to_vec()), + ]; + + let array: BinaryViewArray = data.into_iter().collect(); + + assert_eq!(array.value(0), b"hello"); + assert_eq!(array.value(1), b""); + assert_eq!( + array.value(2), + b"Databend Cloud is a Cost-Effective alternative to Snowflake." + ); + assert_eq!( + unsafe { array.value_unchecked(2) }, + b"Databend Cloud is a Cost-Effective alternative to Snowflake." + ); + assert_eq!( + array.validity(), + Some(&Bitmap::from_u8_slice([0b00000101], 3)) + ); + assert!(array.is_valid(0)); + assert!(!array.is_valid(1)); + assert!(array.is_valid(2)); + + let array2 = BinaryViewArray::new_unchecked( + DataType::BinaryView, + array.views().clone(), + array.data_buffers().clone(), + array.validity().cloned(), + array.total_bytes_len(), + array.total_buffer_len(), + ); + + assert_eq!(array, array2); + + let array = array.sliced(1, 2); + + assert_eq!(array.value(0), b""); + assert_eq!( + array.value(1), + b"Databend Cloud is a Cost-Effective alternative to Snowflake." + ); +} + +#[test] +fn from() { + let array = Utf8ViewArray::from([Some("hello"), Some(" "), None]); + + let a = array.validity().unwrap(); + assert_eq!(a, &Bitmap::from([true, true, false])); + + let array = BinaryViewArray::from([Some(b"hello".to_vec()), Some(b" ".to_vec()), None]); + + let a = array.validity().unwrap(); + assert_eq!(a, &Bitmap::from([true, true, false])); +} + +#[test] +fn from_iter() { + let iter = std::iter::repeat(b"hello").take(2).map(Some); + let a: BinaryViewArray = iter.collect(); + assert_eq!(a.len(), 2); +} + +#[test] +fn with_validity() { + let array = BinaryViewArray::from([Some(b"hello".as_ref()), Some(b" ".as_ref()), None]); + + let array = array.with_validity(None); + + let a = array.validity(); + assert_eq!(a, None); +} + +#[test] +#[should_panic] +fn wrong_data_type() { + let validity = Some(Bitmap::new_zeroed(3)); + BinaryViewArray::try_new(DataType::Int8, Buffer::zeroed(3), Arc::from([]), validity).unwrap(); +} + +#[test] +fn debug() { + let data = vec![Some([1_u8, 2_u8].to_vec()), Some(vec![]), None]; + + let array: BinaryViewArray = data.into_iter().collect(); + + assert_eq!(format!("{array:?}"), "BinaryViewArray[[1, 2], [], None]"); +} + +#[test] +fn rev_iter() { + let array = BinaryViewArray::from([Some("hello".as_bytes()), Some(" ".as_bytes()), None]); + + assert_eq!(array.into_iter().rev().collect::>(), vec![ + None, + Some(" ".as_bytes()), + Some("hello".as_bytes()) + ]); +} + +#[test] +fn iter_nth() { + let array = BinaryViewArray::from([Some("hello"), Some(" "), None]); + + assert_eq!(array.iter().nth(1), Some(Some(" ".as_bytes()))); + assert_eq!(array.iter().nth(10), None); +} diff --git a/src/common/arrow/tests/it/arrow/array/binview/mutable.rs b/src/common/arrow/tests/it/arrow/array/binview/mutable.rs new file mode 100644 index 000000000000..f2b70037cf7c --- /dev/null +++ b/src/common/arrow/tests/it/arrow/array/binview/mutable.rs @@ -0,0 +1,50 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use databend_common_arrow::arrow::array::Array; +use databend_common_arrow::arrow::array::MutableBinaryViewArray; +use databend_common_arrow::arrow::array::Utf8ViewArray; +use databend_common_arrow::arrow::bitmap::Bitmap; + +#[test] +fn new() { + assert_eq!(MutableBinaryViewArray::<[u8]>::new().len(), 0); + + let a = MutableBinaryViewArray::<[u8]>::with_capacity(2); + assert_eq!(a.len(), 0); + assert_eq!(a.capacity(), 2); +} + +#[test] +fn from_iter() { + let iter = (0..3u8).map(|x| Some(vec![x; x as usize])); + let a: MutableBinaryViewArray<[u8]> = iter.clone().collect(); + let mut v_iter = a.values_iter(); + assert_eq!(v_iter.next(), Some(&[] as &[u8])); + assert_eq!(v_iter.next(), Some(&[1u8] as &[u8])); + assert_eq!(v_iter.next(), Some(&[2u8, 2] as &[u8])); + assert_eq!(a.validity(), None); + + let a = MutableBinaryViewArray::<[u8]>::from_iter(iter); + assert_eq!(a.validity(), None); +} + +#[test] +fn push_null() { + let mut array = MutableBinaryViewArray::new(); + array.push::<&str>(None); + + let array: Utf8ViewArray = array.into(); + assert_eq!(array.validity(), Some(&Bitmap::from([false]))); +} diff --git a/src/common/arrow/tests/it/arrow/array/binview/mutable_values.rs b/src/common/arrow/tests/it/arrow/array/binview/mutable_values.rs new file mode 100644 index 000000000000..0c23a157f65c --- /dev/null +++ b/src/common/arrow/tests/it/arrow/array/binview/mutable_values.rs @@ -0,0 +1,31 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use databend_common_arrow::arrow::array::MutableArray; +use databend_common_arrow::arrow::array::MutableBinaryViewArray; + +#[test] +fn extend_from_iter() { + let mut b = MutableBinaryViewArray::::new(); + b.extend_trusted_len_values(vec!["a", "b"].into_iter()); + + let a = b.clone(); + b.extend_trusted_len_values(a.values_iter()); + + assert_eq!( + b.as_box(), + MutableBinaryViewArray::::from_values_iter(vec!["a", "b", "a", "b"].into_iter()) + .as_box() + ) +} diff --git a/src/common/arrow/tests/it/arrow/array/binview/to_mutable.rs b/src/common/arrow/tests/it/arrow/array/binview/to_mutable.rs new file mode 100644 index 000000000000..7ee7856ba01d --- /dev/null +++ b/src/common/arrow/tests/it/arrow/array/binview/to_mutable.rs @@ -0,0 +1,46 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use databend_common_arrow::arrow::array::BinaryViewArray; +use databend_common_arrow::arrow::bitmap::Bitmap; +use databend_common_arrow::arrow::datatypes::DataType; + +#[test] +fn not_shared() { + let array = BinaryViewArray::from([Some("hello"), Some(" "), None]); + assert!(array.into_mut().is_right()); +} + +#[test] +#[allow(clippy::redundant_clone)] +fn shared() { + let validity = Bitmap::from([true]); + let data = vec![ + Some(b"hello".to_vec()), + None, + // larger than 12 bytes. + Some(b"Databend Cloud is a Cost-Effective alternative to Snowflake.".to_vec()), + ]; + + let array: BinaryViewArray = data.into_iter().collect(); + let array2 = BinaryViewArray::new_unchecked( + DataType::BinaryView, + array.views().clone(), + array.data_buffers().clone(), + Some(validity.clone()), + array.total_bytes_len(), + array.total_buffer_len(), + ); + assert!(array2.into_mut().is_left()) +} diff --git a/src/common/arrow/tests/it/arrow/array/mod.rs b/src/common/arrow/tests/it/arrow/array/mod.rs index 66fefaf3ceec..85944735a3e4 100644 --- a/src/common/arrow/tests/it/arrow/array/mod.rs +++ b/src/common/arrow/tests/it/arrow/array/mod.rs @@ -14,6 +14,7 @@ // limitations under the License. mod binary; +mod binview; mod boolean; mod dictionary; mod equal;