From 122c8abf926cb3681a0ad69d02d3610889c02e21 Mon Sep 17 00:00:00 2001 From: Yijun Zhao Date: Wed, 21 Feb 2024 18:35:11 +0800 Subject: [PATCH] add tests for string_view type --- .../arrow/src/arrow/array/binview/from.rs | 24 ++++ .../arrow/src/arrow/array/binview/mod.rs | 11 +- .../arrow/src/arrow/array/binview/mutable.rs | 12 +- .../arrow/tests/it/arrow/array/binview/mod.rs | 130 ++++++++++++++++++ src/common/arrow/tests/it/arrow/array/mod.rs | 1 + 5 files changed, 176 insertions(+), 2 deletions(-) create mode 100644 src/common/arrow/src/arrow/array/binview/from.rs create mode 100644 src/common/arrow/tests/it/arrow/array/binview/mod.rs diff --git a/src/common/arrow/src/arrow/array/binview/from.rs b/src/common/arrow/src/arrow/array/binview/from.rs new file mode 100644 index 0000000000000..368d849d10c58 --- /dev/null +++ b/src/common/arrow/src/arrow/array/binview/from.rs @@ -0,0 +1,24 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::arrow::array::BinaryViewArrayGeneric; +use crate::arrow::array::MutableBinaryViewArray; +use crate::arrow::array::ViewType; + +impl> FromIterator> for BinaryViewArrayGeneric { + #[inline] + fn from_iter>>(iter: I) -> Self { + MutableBinaryViewArray::::from_iter(iter).into() + } +} diff --git a/src/common/arrow/src/arrow/array/binview/mod.rs b/src/common/arrow/src/arrow/array/binview/mod.rs index 3bf310087b826..17b8ee72b472a 100644 --- a/src/common/arrow/src/arrow/array/binview/mod.rs +++ b/src/common/arrow/src/arrow/array/binview/mod.rs @@ -15,6 +15,7 @@ mod ffi; pub(crate) mod fmt; +mod from; mod iterator; mod mutable; mod view; @@ -23,6 +24,7 @@ mod private { pub trait Sealed: Send + Sync {} impl Sealed for str {} + impl Sealed for [u8] {} } @@ -157,6 +159,7 @@ impl Clone for BinaryViewArrayGeneric { } unsafe impl Send for BinaryViewArrayGeneric {} + unsafe impl Sync for BinaryViewArrayGeneric {} fn buffers_into_raw(buffers: &[Buffer]) -> Arc<[(*const T, usize)]> { @@ -254,6 +257,12 @@ impl BinaryViewArrayGeneric { } } + /// Returns a new [`BinaryViewArrayGeneric`] from a slice of `&T`. + // Note: this can't be `impl From` because Rust does not allow double `AsRef` on it. + pub fn from, P: AsRef<[Option]>>(slice: P) -> Self { + MutableBinaryViewArray::::from(slice).into() + } + /// Creates an empty [`BinaryViewArrayGeneric`], i.e. whose `.len` is zero. #[inline] pub fn new_empty(data_type: DataType) -> Self { @@ -323,7 +332,7 @@ impl BinaryViewArrayGeneric { BinaryViewValueIter::new(self) } - pub fn len_iter(&self) -> impl Iterator + '_ { + pub fn len_iter(&self) -> impl Iterator + '_ { self.views.iter().map(|v| v.length) } diff --git a/src/common/arrow/src/arrow/array/binview/mutable.rs b/src/common/arrow/src/arrow/array/binview/mutable.rs index c73f2b520e319..5d2886f8bc25d 100644 --- a/src/common/arrow/src/arrow/array/binview/mutable.rs +++ b/src/common/arrow/src/arrow/array/binview/mutable.rs @@ -175,8 +175,16 @@ impl MutableBinaryViewArray { payload[0..4].copy_from_slice(&len.to_le_bytes()); if len <= 12 { + // | len | prefix | remaining(zero-padded) | + // ^ ^ ^ + // | 4 bytes | 4 bytes | 8 bytes | payload[4..4 + bytes.len()].copy_from_slice(bytes); } else { + // | len | prefix | buffer | offsets | + // ^ ^ ^ ^ + // | 4 bytes | 4 bytes | 4 bytes | 4 bytes | + // + // buffer index + offset -> real binary data self.total_buffer_len += bytes.len(); let required_cap = self.in_progress_buffer.len() + bytes.len(); if self.in_progress_buffer.capacity() < required_cap { @@ -192,6 +200,7 @@ impl MutableBinaryViewArray { let offset = self.in_progress_buffer.len() as u32; self.in_progress_buffer.extend_from_slice(bytes); + // set prefix unsafe { payload[4..8].copy_from_slice(bytes.get_unchecked(0..4)) }; let buffer_idx: u32 = self.completed_buffers.len().try_into().unwrap(); payload[8..12].copy_from_slice(&buffer_idx.to_le_bytes()); @@ -347,12 +356,13 @@ impl MutableBinaryViewArray { let len = v.length; // view layout: + // for no-inlined layout: // length: 4 bytes // prefix: 4 bytes // buffer_index: 4 bytes // offset: 4 bytes - // inlined layout: + // for inlined layout: // length: 4 bytes // data: 12 bytes let bytes = if len <= 12 { diff --git a/src/common/arrow/tests/it/arrow/array/binview/mod.rs b/src/common/arrow/tests/it/arrow/array/binview/mod.rs new file mode 100644 index 0000000000000..46c197e4f5254 --- /dev/null +++ b/src/common/arrow/tests/it/arrow/array/binview/mod.rs @@ -0,0 +1,130 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use databend_common_arrow::arrow::array::Array; +use databend_common_arrow::arrow::array::BinaryViewArray; +use databend_common_arrow::arrow::array::Utf8ViewArray; +use databend_common_arrow::arrow::bitmap::Bitmap; +use databend_common_arrow::arrow::datatypes::DataType; + +#[test] +fn basics_string_view() { + let data = vec![ + Some("hello"), + None, + // larger than 12 bytes. + Some("Databend Cloud is a Cost-Effective alternative to Snowflake."), + ]; + + let array: Utf8ViewArray = data.into_iter().collect(); + + assert_eq!(array.value(0), "hello"); + assert_eq!(array.value(1), ""); + assert_eq!( + array.value(2), + "Databend Cloud is a Cost-Effective alternative to Snowflake." + ); + assert_eq!( + unsafe { array.value_unchecked(2) }, + "Databend Cloud is a Cost-Effective alternative to Snowflake." + ); + assert_eq!( + array.validity(), + Some(&Bitmap::from_u8_slice([0b00000101], 3)) + ); + assert!(array.is_valid(0)); + assert!(!array.is_valid(1)); + assert!(array.is_valid(2)); + + let array2 = Utf8ViewArray::new_unchecked( + DataType::Utf8View, + array.views().clone(), + array.data_buffers().clone(), + array.validity().cloned(), + array.total_bytes_len(), + array.total_buffer_len(), + ); + + assert_eq!(array, array2); + + let array = array.sliced(1, 2); + + assert_eq!(array.value(0), ""); + assert_eq!( + array.value(1), + "Databend Cloud is a Cost-Effective alternative to Snowflake." + ); +} + +#[test] +fn basics_binary_view() { + let data = vec![ + Some(b"hello".to_vec()), + None, + // larger than 12 bytes. + Some(b"Databend Cloud is a Cost-Effective alternative to Snowflake.".to_vec()), + ]; + + let array: BinaryViewArray = data.into_iter().collect(); + + assert_eq!(array.value(0), b"hello"); + assert_eq!(array.value(1), b""); + assert_eq!( + array.value(2), + b"Databend Cloud is a Cost-Effective alternative to Snowflake." + ); + assert_eq!( + unsafe { array.value_unchecked(2) }, + b"Databend Cloud is a Cost-Effective alternative to Snowflake." + ); + assert_eq!( + array.validity(), + Some(&Bitmap::from_u8_slice([0b00000101], 3)) + ); + assert!(array.is_valid(0)); + assert!(!array.is_valid(1)); + assert!(array.is_valid(2)); + + let array2 = BinaryViewArray::new_unchecked( + DataType::Utf8View, + array.views().clone(), + array.data_buffers().clone(), + array.validity().cloned(), + array.total_bytes_len(), + array.total_buffer_len(), + ); + + assert_eq!(array, array2); + + let array = array.sliced(1, 2); + + assert_eq!(array.value(0), b""); + assert_eq!( + array.value(1), + b"Databend Cloud is a Cost-Effective alternative to Snowflake." + ); +} + +#[test] +fn from() { + let array = Utf8ViewArray::from([Some("hello"), Some(" "), None]); + + let a = array.validity().unwrap(); + assert_eq!(a, &Bitmap::from([true, true, false])); + + let array = BinaryViewArray::from([Some(b"hello".to_vec()), Some(b" ".to_vec()), None]); + + let a = array.validity().unwrap(); + assert_eq!(a, &Bitmap::from([true, true, false])); +} diff --git a/src/common/arrow/tests/it/arrow/array/mod.rs b/src/common/arrow/tests/it/arrow/array/mod.rs index 66fefaf3ceecf..85944735a3e43 100644 --- a/src/common/arrow/tests/it/arrow/array/mod.rs +++ b/src/common/arrow/tests/it/arrow/array/mod.rs @@ -14,6 +14,7 @@ // limitations under the License. mod binary; +mod binview; mod boolean; mod dictionary; mod equal;