Skip to content

Commit

Permalink
add tests for string_view type
Browse files Browse the repository at this point in the history
  • Loading branch information
ariesdevil committed Mar 1, 2024
1 parent 2d920aa commit 31939cb
Show file tree
Hide file tree
Showing 10 changed files with 461 additions and 9 deletions.
2 changes: 1 addition & 1 deletion src/common/arrow/src/arrow/array/binview/ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ impl<T: ViewType + ?Sized, A: ffi::ArrowArrayRef> FromFfi<A> for BinaryViewArray
let validity = unsafe { array.validity() }?;
let views = unsafe { array.buffer::<View>(1) }?;

// 2 - validity + views
// n_buffers - 2, 2 means validity + views
let n_buffers = array.n_buffers();
let mut remaining_buffers = n_buffers - 2;
if remaining_buffers <= 1 {
Expand Down
24 changes: 24 additions & 0 deletions src/common/arrow/src/arrow/array/binview/from.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
// Copyright 2021 Datafuse Labs
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use crate::arrow::array::BinaryViewArrayGeneric;
use crate::arrow::array::MutableBinaryViewArray;
use crate::arrow::array::ViewType;

/// Builds a [`BinaryViewArrayGeneric`] from an iterator of optional values.
///
/// The items are first gathered into a [`MutableBinaryViewArray`], which is then
/// converted into the immutable array.
impl<T: ViewType + ?Sized, P: AsRef<T>> FromIterator<Option<P>> for BinaryViewArrayGeneric<T> {
    #[inline]
    fn from_iter<I: IntoIterator<Item = Option<P>>>(iter: I) -> Self {
        let mutable = MutableBinaryViewArray::<T>::from_iter(iter);
        mutable.into()
    }
}
101 changes: 97 additions & 4 deletions src/common/arrow/src/arrow/array/binview/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

mod ffi;
pub(crate) mod fmt;
mod from;
mod iterator;
mod mutable;
mod view;
Expand All @@ -23,6 +24,7 @@ mod private {
pub trait Sealed: Send + Sync {}

impl Sealed for str {}

impl Sealed for [u8] {}
}

Expand All @@ -33,6 +35,7 @@ use std::sync::atomic::AtomicU64;
use std::sync::atomic::Ordering;
use std::sync::Arc;

use either::Either;
pub use iterator::BinaryViewValueIter;
pub use mutable::MutableBinaryViewArray;
use private::Sealed;
Expand Down Expand Up @@ -70,7 +73,7 @@ pub trait ViewType: Sealed + 'static + PartialEq + AsRef<Self> {
#[allow(clippy::wrong_self_convention)]
fn into_owned(&self) -> Self::Owned;

fn dtype() -> &'static DataType;
fn data_type() -> &'static DataType;
}

impl ViewType for str {
Expand All @@ -92,7 +95,7 @@ impl ViewType for str {
self.to_string()
}

fn dtype() -> &'static DataType {
fn data_type() -> &'static DataType {
&UTF8_VIEW_TYPE
}
}
Expand All @@ -116,7 +119,7 @@ impl ViewType for [u8] {
self.to_vec()
}

fn dtype() -> &'static DataType {
fn data_type() -> &'static DataType {
&BIN_VIEW_TYPE
}
}
Expand Down Expand Up @@ -157,6 +160,7 @@ impl<T: ViewType + ?Sized> Clone for BinaryViewArrayGeneric<T> {
}

// NOTE(review): these manual impls are presumably required because the array keeps
// raw `*const` buffer pointers (see `buffers_into_raw`), which are not automatically
// `Send`/`Sync`. Confirm that the pointed-to data is kept alive and never mutated
// through these pointers for as long as the array exists.
unsafe impl<T: ViewType + ?Sized> Send for BinaryViewArrayGeneric<T> {}

unsafe impl<T: ViewType + ?Sized> Sync for BinaryViewArrayGeneric<T> {}

fn buffers_into_raw<T>(buffers: &[Buffer<T>]) -> Arc<[(*const T, usize)]> {
Expand Down Expand Up @@ -233,6 +237,11 @@ impl<T: ViewType + ?Sized> BinaryViewArrayGeneric<T> {
buffers: Arc<[Buffer<u8>]>,
validity: Option<Bitmap>,
) -> Result<Self> {
if data_type.to_physical_type() != Self::default_data_type().to_physical_type() {
return Err(Error::oos(
"BinaryViewArray can only be initialized with DataType::BinaryView or DataType::Utf8View",
));
}
if T::IS_UTF8 {
validate_utf8_view(views.as_ref(), buffers.as_ref())?;
} else {
Expand All @@ -254,6 +263,12 @@ impl<T: ViewType + ?Sized> BinaryViewArrayGeneric<T> {
}
}

/// Creates a [`BinaryViewArrayGeneric`] from a slice of optional values.
///
/// The slice is first loaded into a [`MutableBinaryViewArray`], which is then
/// converted into the immutable array.
// Note: this can't be `impl From` because Rust does not allow double `AsRef` on it.
pub fn from<V: AsRef<T>, P: AsRef<[Option<V>]>>(slice: P) -> Self {
    let mutable = MutableBinaryViewArray::<T>::from(slice);
    mutable.into()
}

/// Creates an empty [`BinaryViewArrayGeneric`], i.e. whose `.len` is zero.
#[inline]
pub fn new_empty(data_type: DataType) -> Self {
Expand Down Expand Up @@ -438,6 +453,84 @@ impl<T: ViewType + ?Sized> BinaryViewArrayGeneric<T> {
total_buffer_len: self.total_buffer_len,
}
}

#[must_use]
pub fn into_mut(self) -> Either<Self, MutableBinaryViewArray<T>> {
use Either::*;
let is_unique = (Arc::strong_count(&self.buffers) + Arc::weak_count(&self.buffers)) == 1;

if let Some(bitmap) = self.validity {
match bitmap.into_mut() {
Left(bitmap) => Left(Self::new_unchecked(
self.data_type,
self.views,
self.buffers,
Some(bitmap),
self.total_bytes_len.load(Ordering::Relaxed) as usize,
self.total_buffer_len,
)),
Right(mutable_bitmap) => match (self.views.into_mut(), is_unique) {
(Right(views), true) => Right(MutableBinaryViewArray {
views,
completed_buffers: self.buffers.to_vec(),
in_progress_buffer: vec![],
validity: Some(mutable_bitmap),
phantom: Default::default(),
total_bytes_len: self.total_bytes_len.load(Ordering::Relaxed) as usize,
total_buffer_len: self.total_buffer_len,
}),
(Right(views), false) => Left(Self::new_unchecked(
self.data_type,
views.into(),
self.buffers,
Some(mutable_bitmap.into()),
self.total_bytes_len.load(Ordering::Relaxed) as usize,
self.total_buffer_len,
)),
(Left(views), _) => Left(Self::new_unchecked(
self.data_type,
views,
self.buffers,
Some(mutable_bitmap.into()),
self.total_bytes_len.load(Ordering::Relaxed) as usize,
self.total_buffer_len,
)),
},
}
} else {
match (self.views.into_mut(), is_unique) {
(Right(views), true) => Right(MutableBinaryViewArray {
views,
completed_buffers: self.buffers.to_vec(),
in_progress_buffer: vec![],
validity: None,
phantom: Default::default(),
total_bytes_len: self.total_bytes_len.load(Ordering::Relaxed) as usize,
total_buffer_len: self.total_buffer_len,
}),
(Right(views), false) => Left(Self::new_unchecked(
self.data_type,
views.into(),
self.buffers,
None,
self.total_bytes_len.load(Ordering::Relaxed) as usize,
self.total_buffer_len,
)),
(Left(views), _) => Left(Self::new_unchecked(
self.data_type,
views,
self.buffers,
None,
self.total_bytes_len.load(Ordering::Relaxed) as usize,
self.total_buffer_len,
)),
}
}
}

pub fn default_data_type() -> &'static DataType {
T::data_type()
}
}

pub type BinaryViewArray = BinaryViewArrayGeneric<[u8]>;
Expand Down Expand Up @@ -500,7 +593,7 @@ impl<T: ViewType + ?Sized> Array for BinaryViewArrayGeneric<T> {
}

fn data_type(&self) -> &DataType {
T::dtype()
T::data_type()
}

fn validity(&self) -> Option<&Bitmap> {
Expand Down
24 changes: 21 additions & 3 deletions src/common/arrow/src/arrow/array/binview/mutable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,11 @@ impl<T: ViewType + ?Sized> MutableBinaryViewArray<T> {
&self.views
}

pub fn validity(&mut self) -> Option<&mut MutableBitmap> {
/// Returns a shared reference to the validity bitmap, or `None` when no bitmap is set.
pub fn validity(&self) -> Option<&MutableBitmap> {
    Option::as_ref(&self.validity)
}

/// Returns a mutable reference to the validity bitmap, or `None` when no bitmap is set.
pub fn validity_mut(&mut self) -> Option<&mut MutableBitmap> {
    Option::as_mut(&mut self.validity)
}

Expand Down Expand Up @@ -175,8 +179,16 @@ impl<T: ViewType + ?Sized> MutableBinaryViewArray<T> {
payload[0..4].copy_from_slice(&len.to_le_bytes());

if len <= 12 {
// | len | prefix | remaining(zero-padded) |
// ^ ^ ^
// | 4 bytes | 4 bytes | 8 bytes |
payload[4..4 + bytes.len()].copy_from_slice(bytes);
} else {
// | len | prefix | buffer | offsets |
// ^ ^ ^ ^
// | 4 bytes | 4 bytes | 4 bytes | 4 bytes |
//
// buffer index + offset -> real binary data
self.total_buffer_len += bytes.len();
let required_cap = self.in_progress_buffer.len() + bytes.len();
if self.in_progress_buffer.capacity() < required_cap {
Expand All @@ -192,6 +204,7 @@ impl<T: ViewType + ?Sized> MutableBinaryViewArray<T> {
let offset = self.in_progress_buffer.len() as u32;
self.in_progress_buffer.extend_from_slice(bytes);

// set prefix
unsafe { payload[4..8].copy_from_slice(bytes.get_unchecked(0..4)) };
let buffer_idx: u32 = self.completed_buffers.len().try_into().unwrap();
payload[8..12].copy_from_slice(&buffer_idx.to_le_bytes());
Expand Down Expand Up @@ -347,12 +360,13 @@ impl<T: ViewType + ?Sized> MutableBinaryViewArray<T> {
let len = v.length;

// view layout:
// for non-inlined layout:
// length: 4 bytes
// prefix: 4 bytes
// buffer_index: 4 bytes
// offset: 4 bytes

// inlined layout:
// for inlined layout:
// length: 4 bytes
// data: 12 bytes
let bytes = if len <= 12 {
Expand All @@ -378,6 +392,10 @@ impl<T: ViewType + ?Sized> MutableBinaryViewArray<T> {
pub fn values_iter(&self) -> MutableBinaryViewValueIter<T> {
MutableBinaryViewValueIter::new(self)
}

/// Collects every item yielded by [`Self::values_iter`] into a `Vec` of borrowed values.
pub fn values(&self) -> Vec<&T> {
    let iter = self.values_iter();
    iter.collect::<Vec<&T>>()
}
}

impl MutableBinaryViewArray<[u8]> {
Expand All @@ -404,7 +422,7 @@ impl<T: ViewType + ?Sized, P: AsRef<T>> FromIterator<Option<P>> for MutableBinar

impl<T: ViewType + ?Sized> MutableArray for MutableBinaryViewArray<T> {
fn data_type(&self) -> &DataType {
T::dtype()
T::data_type()
}

fn len(&self) -> usize {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ impl<T: ViewType + ?Sized> Pushable<&T> for MutableBinaryViewArray<T> {
views.push(view);
}

if let Some(bitmap) = self.validity() {
if let Some(bitmap) = self.validity_mut() {
bitmap.extend_constant(remaining, true)
}
}
Expand Down
Loading

0 comments on commit 31939cb

Please sign in to comment.