From e741c29a68e708715f82a3c58556f1fa5c744c63 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Wed, 8 Mar 2023 14:39:55 +0000 Subject: [PATCH] Add ArrayDataLayout (#1799) --- arrow-buffer/src/buffer/boolean.rs | 5 + arrow-buffer/src/buffer/offset.rs | 15 ++ arrow-buffer/src/buffer/run.rs | 10 ++ arrow-buffer/src/buffer/scalar.rs | 15 ++ arrow-data/src/data/boolean.rs | 139 +++++++++++++++ arrow-data/src/data/buffers.rs | 11 +- arrow-data/src/data/bytes.rs | 220 ++++++++++++++++++++++- arrow-data/src/data/dictionary.rs | 128 +++++++++++++- arrow-data/src/data/list.rs | 208 ++++++++++++++++++++-- arrow-data/src/data/mod.rs | 271 +++++++++++++++++------------ arrow-data/src/data/null.rs | 104 +++++++++++ arrow-data/src/data/primitive.rs | 120 ++++++++++++- arrow-data/src/data/run.rs | 159 +++++++++++++++-- arrow-data/src/data/struct.rs | 50 +++++- arrow-data/src/data/types.rs | 14 +- arrow-data/src/data/union.rs | 103 ++++++++++- 16 files changed, 1395 insertions(+), 177 deletions(-) create mode 100644 arrow-data/src/data/boolean.rs create mode 100644 arrow-data/src/data/null.rs diff --git a/arrow-buffer/src/buffer/boolean.rs b/arrow-buffer/src/buffer/boolean.rs index 0239111cbafe..68d8695fe3b7 100644 --- a/arrow-buffer/src/buffer/boolean.rs +++ b/arrow-buffer/src/buffer/boolean.rs @@ -139,4 +139,9 @@ impl BooleanBuffer { pub fn inner(&self) -> &Buffer { &self.buffer } + + /// Returns the inner [`Buffer`] + pub fn into_inner(self) -> Buffer { + self.buffer + } } diff --git a/arrow-buffer/src/buffer/offset.rs b/arrow-buffer/src/buffer/offset.rs index a80c3c7ecb69..ae559b9672d4 100644 --- a/arrow-buffer/src/buffer/offset.rs +++ b/arrow-buffer/src/buffer/offset.rs @@ -39,6 +39,21 @@ impl OffsetBuffer { let buffer = MutableBuffer::from_len_zeroed(std::mem::size_of::()); Self(buffer.into_buffer().into()) } + + /// Returns the inner [`ScalarBuffer`] + pub fn inner(&self) -> &ScalarBuffer { + &self.0 + } + + /// Returns the inner [`Buffer`] + pub fn into_inner(self) -> ScalarBuffer { + self.0 + } + + /// Returns a zero-copy slice of this buffer with length `len` and starting at `offset` + pub fn slice(&self, offset: usize, len: usize) -> Self { + Self(self.0.slice(offset, len.saturating_add(1))) + } } impl Deref for OffsetBuffer { diff --git a/arrow-buffer/src/buffer/run.rs b/arrow-buffer/src/buffer/run.rs index 04cfbdb8f14e..743561360159 100644 --- a/arrow-buffer/src/buffer/run.rs +++ b/arrow-buffer/src/buffer/run.rs @@ -157,4 +157,14 @@ where len, } } + + /// Returns the inner [`ScalarBuffer`] + pub fn inner(&self) -> &ScalarBuffer { + &self.run_ends + } + + /// Returns the inner [`ScalarBuffer`] + pub fn into_inner(self) -> ScalarBuffer { + self.run_ends + } } diff --git a/arrow-buffer/src/buffer/scalar.rs b/arrow-buffer/src/buffer/scalar.rs index 01a64633f532..593af06d3d50 100644 --- a/arrow-buffer/src/buffer/scalar.rs +++ b/arrow-buffer/src/buffer/scalar.rs @@ -50,6 +50,21 @@ impl ScalarBuffer { let byte_len = len.checked_mul(size).expect("length overflow"); buffer.slice_with_length(byte_offset, byte_len).into() } + + /// Returns a zero-copy slice of this buffer with length `len` and starting at `offset` + pub fn slice(&self, offset: usize, len: usize) -> Self { + Self::new(self.buffer.clone(), offset, len) + } + + /// Returns the inner [`Buffer`] + pub fn inner(&self) -> &Buffer { + &self.buffer + } + + /// Returns the inner [`Buffer`] + pub fn into_inner(self) -> Buffer { + self.buffer + } } impl Deref for ScalarBuffer { diff --git a/arrow-data/src/data/boolean.rs b/arrow-data/src/data/boolean.rs new file mode 100644 index 000000000000..f8a4ebeba457 --- /dev/null +++ b/arrow-data/src/data/boolean.rs @@ -0,0 +1,139 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::data::types::PhysicalType; +use crate::data::ArrayDataLayout; +use crate::{ArrayDataBuilder, Buffers}; +use arrow_buffer::buffer::{BooleanBuffer, NullBuffer}; +use arrow_schema::DataType; + +#[derive(Debug, Clone)] +pub struct BooleanArrayData { + data_type: DataType, + values: BooleanBuffer, + nulls: Option, +} + +impl BooleanArrayData { + /// Create a new [`BooleanArrayData`] + /// + /// # Panics + /// + /// Panics if + /// - `nulls` and `values` are different lengths + /// - `data_type` is not compatible with `T` + pub fn new( + data_type: DataType, + values: BooleanBuffer, + nulls: Option, + ) -> Self { + let physical = PhysicalType::from(&data_type); + assert_eq!( + physical, PhysicalType::Boolean, + "Illegal physical type for BooleanArrayData of datatype {:?}, expected {:?} got {:?}", + data_type, + PhysicalType::Boolean, + physical + ); + + if let Some(n) = nulls.as_ref() { + assert_eq!(values.len(), n.len()) + } + Self { + data_type, + values, + nulls, + } + } + + /// Create a new [`BooleanArrayData`] + /// + /// # Safety + /// + /// - `nulls` and `values` are the same lengths + /// - `PhysicalType::from(&data_type) == PhysicalType::Boolean` + pub unsafe fn new_unchecked( + data_type: DataType, + values: BooleanBuffer, + nulls: Option, + ) -> Self { + Self { + data_type, + values, + nulls, + } + } + + /// Creates a new [`BooleanArrayData`] from raw buffers + /// + /// # Safety + /// + /// See [`BooleanArrayData::new_unchecked`] + pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder) -> Self { + let values = builder.buffers.into_iter().next().unwrap(); + let values = BooleanBuffer::new(values, builder.offset, builder.len); + Self { + values, + data_type: builder.data_type, + nulls: builder.nulls, + } + } + + /// Returns the null buffer if any + #[inline] + pub fn nulls(&self) -> Option<&NullBuffer> { + self.nulls.as_ref() + } + + /// Returns the boolean values + #[inline] + pub fn values(&self) -> &BooleanBuffer { + &self.values + } + + /// Returns the data type of this array + #[inline] + pub fn data_type(&self) -> &DataType { + &self.data_type + } + + /// Returns the underlying parts of this [`BooleanArrayData`] + pub fn into_parts(self) -> (DataType, BooleanBuffer, Option) { + (self.data_type, self.values, self.nulls) + } + + /// Returns a zero-copy slice of this array + pub fn slice(&self, offset: usize, len: usize) -> Self { + Self { + data_type: self.data_type.clone(), + values: self.values.slice(offset, len), + nulls: self.nulls.as_ref().map(|x| x.slice(offset, len)), + } + } + + /// Returns an [`ArrayDataLayout`] representation of this + pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { + ArrayDataLayout { + data_type: &self.data_type, + len: self.values.len(), + offset: self.values.offset(), + nulls: self.nulls.as_ref(), + buffers: Buffers::one(self.values().inner()), + child_data: &[], + } + } +} diff --git a/arrow-data/src/data/buffers.rs b/arrow-data/src/data/buffers.rs index 3b57bfe0e23c..8a498d319aae 100644 --- a/arrow-data/src/data/buffers.rs +++ b/arrow-data/src/data/buffers.rs @@ -25,7 +25,6 @@ pub struct Buffers<'a>([Option<&'a Buffer>; 2]); impl<'a> Buffers<'a> { /// Temporary will be removed once ArrayData does not store `Vec` directly (#3769) - #[inline] pub(crate) fn from_slice(a: &'a [Buffer]) -> Self { match a.len() { 0 => Self([None, None]), @@ -34,6 +33,16 @@ impl<'a> Buffers<'a> { } } + #[inline] + pub(crate) fn one(b: &'a Buffer) -> Self { + Self([Some(b), None]) + } + + #[inline] + pub(crate) fn two(a: &'a Buffer, b: &'a Buffer) -> Self { + Self([Some(a), Some(b)]) + } + /// Returns the number of [`Buffer`] in this collection #[inline] pub fn len(&self) -> usize { diff --git a/arrow-data/src/data/bytes.rs b/arrow-data/src/data/bytes.rs index 521c1959aaa1..80198b395edb 100644 --- a/arrow-data/src/data/bytes.rs +++ b/arrow-data/src/data/bytes.rs @@ -16,7 +16,9 @@ // under the License. use crate::data::types::{BytesType, OffsetType}; -use arrow_buffer::buffer::{NullBuffer, ScalarBuffer}; +use crate::data::ArrayDataLayout; +use crate::{ArrayDataBuilder, Buffers}; +use arrow_buffer::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer}; use arrow_buffer::{ArrowNativeType, Buffer}; use arrow_schema::DataType; use std::marker::PhantomData; @@ -194,6 +196,23 @@ impl private::BytesOffsetSealed for i64 { } } +/// Applies op to each variant of [`ArrayDataBytes`] +#[macro_export] +macro_rules! bytes_op { + ($array:ident, $op:block) => { + match $array { + ArrayDataBytes::Binary($array) => match $array { + ArrayDataBytesOffset::Small($array) => $op + ArrayDataBytesOffset::Large($array) => $op + } + ArrayDataBytes::Utf8($array) => match $array { + ArrayDataBytesOffset::Small($array) => $op + ArrayDataBytesOffset::Large($array) => $op + } + } + }; +} + /// An enumeration of the types of [`ArrayDataBytesOffset`] #[derive(Debug, Clone)] pub enum ArrayDataBytes { @@ -215,6 +234,48 @@ impl ArrayDataBytes { ) -> Option> { O::downcast(B::downcast(self)?) } + + /// Returns a zero-copy slice of this array + pub fn slice(&self, offset: usize, len: usize) -> Self { + let s = self; + bytes_op!(s, { s.slice(offset, len).into() }) + } + + /// Returns an [`ArrayDataLayout`] representation of this + pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { + let s = self; + bytes_op!(s, { s.layout() }) + } + + /// Creates a new [`ArrayDataBytes`] from raw buffers + /// + /// # Safety + /// + /// See [`BytesArrayData::new_unchecked`] + pub(crate) unsafe fn from_raw( + builder: ArrayDataBuilder, + offset: OffsetType, + bytes: BytesType, + ) -> Self { + match bytes { + BytesType::Binary => Self::Binary(match offset { + OffsetType::Int32 => { + ArrayDataBytesOffset::Small(BytesArrayData::from_raw(builder)) + } + OffsetType::Int64 => { + ArrayDataBytesOffset::Large(BytesArrayData::from_raw(builder)) + } + }), + BytesType::Utf8 => Self::Utf8(match offset { + OffsetType::Int32 => { + ArrayDataBytesOffset::Small(BytesArrayData::from_raw(builder)) + } + OffsetType::Int64 => { + ArrayDataBytesOffset::Large(BytesArrayData::from_raw(builder)) + } + }), + } + } } /// An enumeration of the types of [`BytesArrayData`] @@ -243,9 +304,9 @@ impl From> for ArrayData #[derive(Debug)] pub struct BytesArrayData { data_type: DataType, - nulls: Option, - offsets: ScalarBuffer, + offsets: OffsetBuffer, values: Buffer, + nulls: Option, phantom: PhantomData, } @@ -271,7 +332,7 @@ impl BytesArrayData { /// - `data_type` must be valid for this layout pub unsafe fn new_unchecked( data_type: DataType, - offsets: ScalarBuffer, + offsets: OffsetBuffer, values: Buffer, nulls: Option, ) -> Self { @@ -284,6 +345,46 @@ impl BytesArrayData { } } + /// Creates a new [`BytesArrayData`] from an [`ArrayDataBuilder`] + /// + /// # Safety + /// + /// See [`Self::new_unchecked`] + pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder) -> Self { + let mut iter = builder.buffers.into_iter(); + let offsets = iter.next().unwrap(); + let values = iter.next().unwrap(); + + let offsets = match builder.len { + 0 => OffsetBuffer::new_empty(), + _ => OffsetBuffer::new_unchecked(ScalarBuffer::new( + offsets, + builder.offset, + builder.len + 1, + )), + }; + + Self { + values, + offsets, + data_type: builder.data_type, + nulls: builder.nulls, + phantom: Default::default(), + } + } + + /// Returns the length + #[inline] + pub fn len(&self) -> usize { + self.offsets.len().wrapping_sub(1) + } + + /// Returns true if this array is empty + #[inline] + pub fn is_empty(&self) -> bool { + self.offsets.len() <= 1 + } + /// Returns the raw byte data #[inline] pub fn values(&self) -> &B { @@ -294,13 +395,13 @@ impl BytesArrayData { /// Returns the offsets #[inline] - pub fn offsets(&self) -> &[O] { + pub fn offsets(&self) -> &OffsetBuffer { &self.offsets } /// Returns the null buffer if any #[inline] - pub fn null_buffer(&self) -> Option<&NullBuffer> { + pub fn nulls(&self) -> Option<&NullBuffer> { self.nulls.as_ref() } @@ -309,14 +410,44 @@ impl BytesArrayData { pub fn data_type(&self) -> &DataType { &self.data_type } + + /// Returns the underlying parts of this [`BytesArrayData`] + pub fn into_parts(self) -> (DataType, OffsetBuffer, Buffer, Option) { + (self.data_type, self.offsets, self.values, self.nulls) + } + + /// Returns a zero-copy slice of this array + pub fn slice(&self, offset: usize, len: usize) -> Self { + Self { + values: self.values.clone(), + offsets: self.offsets.slice(offset, len), + data_type: self.data_type.clone(), + nulls: self.nulls().as_ref().map(|x| x.slice(offset, len)), + phantom: Default::default(), + } + } + + /// Returns an [`ArrayDataLayout`] representation of this + pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { + ArrayDataLayout { + data_type: &self.data_type, + len: self.offsets.len().wrapping_sub(1), + offset: 0, + nulls: self.nulls.as_ref(), + buffers: Buffers::two(self.offsets.inner().inner(), &self.values), + child_data: &[], + } + } } /// ArrayData for [fixed-size arrays](https://arrow.apache.org/docs/format/Columnar.html#fixed-size-primitive-layout) of bytes #[derive(Debug, Clone)] pub struct FixedSizeBinaryArrayData { data_type: DataType, - nulls: Option, + len: usize, + element_size: usize, values: Buffer, + nulls: Option, } impl FixedSizeBinaryArrayData { @@ -325,9 +456,11 @@ impl FixedSizeBinaryArrayData { /// # Safety /// /// - `data_type` must be valid for this layout - /// - `nulls.len() == values.len() / element_size` + /// - `nulls.len() == values.len() / element_size == len` pub unsafe fn new_unchecked( data_type: DataType, + len: usize, + element_size: usize, values: Buffer, nulls: Option, ) -> Self { @@ -335,9 +468,46 @@ impl FixedSizeBinaryArrayData { data_type, nulls, values, + len, + element_size, } } + /// Creates a new [`FixedSizeBinaryArrayData`] from raw buffers + /// + /// # Safety + /// + /// See [`FixedSizeBinaryArrayData::new_unchecked`] + pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder, size: usize) -> Self { + let values = builder.buffers[0] + .slice_with_length(builder.offset * size, builder.len * size); + Self { + values, + data_type: builder.data_type, + len: builder.len, + element_size: size, + nulls: builder.nulls, + } + } + + /// Returns the length + #[inline] + pub fn len(&self) -> usize { + self.len + } + + /// Returns true if this array is empty + #[inline] + pub fn is_empty(&self) -> bool { + self.len == 0 + } + + /// Returns the size of each element + #[inline] + pub fn element_size(&self) -> usize { + self.element_size + } + /// Returns the raw byte data #[inline] pub fn values(&self) -> &[u8] { @@ -346,7 +516,7 @@ impl FixedSizeBinaryArrayData { /// Returns the null buffer if any #[inline] - pub fn null_buffer(&self) -> Option<&NullBuffer> { + pub fn nulls(&self) -> Option<&NullBuffer> { self.nulls.as_ref() } @@ -355,4 +525,36 @@ impl FixedSizeBinaryArrayData { pub fn data_type(&self) -> &DataType { &self.data_type } + + /// Returns the underlying parts of this [`FixedSizeBinaryArrayData`] + pub fn into_parts(self) -> (DataType, Buffer, Option) { + (self.data_type, self.values, self.nulls) + } + + /// Returns a zero-copy slice of this array + pub fn slice(&self, offset: usize, len: usize) -> Self { + let offset_element = offset.checked_mul(self.element_size).expect("overflow"); + let len_element = len.checked_mul(self.element_size).expect("overflow"); + let values = self.values.slice_with_length(offset_element, len_element); + + Self { + len, + values, + data_type: self.data_type.clone(), + element_size: self.element_size, + nulls: self.nulls().as_ref().map(|x| x.slice(offset, len)), + } + } + + /// Returns an [`ArrayDataLayout`] representation of this + pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { + ArrayDataLayout { + data_type: &self.data_type, + len: self.len, + offset: 0, + nulls: self.nulls.as_ref(), + buffers: Buffers::one(&self.values), + child_data: &[], + } + } } diff --git a/arrow-data/src/data/dictionary.rs b/arrow-data/src/data/dictionary.rs index 2ec4ee005287..68ad83dcb1d0 100644 --- a/arrow-data/src/data/dictionary.rs +++ b/arrow-data/src/data/dictionary.rs @@ -16,7 +16,8 @@ // under the License. use crate::data::types::DictionaryKeyType; -use crate::ArrayData; +use crate::data::ArrayDataLayout; +use crate::{ArrayData, ArrayDataBuilder, Buffers}; use arrow_buffer::buffer::{NullBuffer, ScalarBuffer}; use arrow_buffer::ArrowNativeType; use arrow_schema::DataType; @@ -85,6 +86,23 @@ dictionary!(u16, UInt16); dictionary!(u32, UInt32); dictionary!(u64, UInt64); +/// Applies op to each variant of [`ArrayDataDictionary`] +#[macro_export] +macro_rules! dictionary_op { + ($array:ident, $op:block) => { + match $array { + ArrayDataDictionary::Int8($array) => $op + ArrayDataDictionary::Int16($array) => $op + ArrayDataDictionary::Int32($array) => $op + ArrayDataDictionary::Int64($array) => $op + ArrayDataDictionary::UInt8($array) => $op + ArrayDataDictionary::UInt16($array) => $op + ArrayDataDictionary::UInt32($array) => $op + ArrayDataDictionary::UInt64($array) => $op + } + }; +} + /// An enumeration of the types of [`DictionaryArrayData`] #[derive(Debug, Clone)] pub enum ArrayDataDictionary { @@ -108,6 +126,46 @@ impl ArrayDataDictionary { pub fn downcast(self) -> Option> { K::downcast(self) } + + /// Returns the values of this dictionary + pub fn values(&self) -> &ArrayData { + let s = self; + dictionary_op!(s, { s.values() }) + } + + /// Returns a zero-copy slice of this array + pub fn slice(&self, offset: usize, len: usize) -> Self { + let s = self; + dictionary_op!(s, { s.slice(offset, len).into() }) + } + + /// Returns an [`ArrayDataLayout`] representation of this + pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { + let s = self; + dictionary_op!(s, { s.layout() }) + } + + /// Creates a new [`ArrayDataDictionary`] from raw buffers + /// + /// # Safety + /// + /// See [`Self::new_unchecked`] + pub(crate) unsafe fn from_raw( + builder: ArrayDataBuilder, + key: DictionaryKeyType, + ) -> Self { + use DictionaryKeyType::*; + match key { + Int8 => Self::Int8(DictionaryArrayData::from_raw(builder)), + Int16 => Self::Int16(DictionaryArrayData::from_raw(builder)), + Int32 => Self::Int32(DictionaryArrayData::from_raw(builder)), + Int64 => Self::Int64(DictionaryArrayData::from_raw(builder)), + UInt8 => Self::UInt8(DictionaryArrayData::from_raw(builder)), + UInt16 => Self::UInt16(DictionaryArrayData::from_raw(builder)), + UInt32 => Self::UInt32(DictionaryArrayData::from_raw(builder)), + UInt64 => Self::UInt64(DictionaryArrayData::from_raw(builder)), + } + } } impl From> for ArrayDataDictionary { @@ -122,7 +180,7 @@ pub struct DictionaryArrayData { data_type: DataType, nulls: Option, keys: ScalarBuffer, - child: Box, + values: Box, } impl DictionaryArrayData { @@ -144,10 +202,39 @@ impl DictionaryArrayData { data_type, nulls, keys, - child: Box::new(child), + values: Box::new(child), + } + } + + /// Creates a new [`DictionaryArrayData`] from raw buffers + /// + /// # Safety + /// + /// See [`Self::new_unchecked`] + pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder) -> Self { + let keys = builder.buffers.into_iter().next().unwrap(); + let keys = ScalarBuffer::new(keys, builder.offset, builder.len); + let values = builder.child_data.into_iter().next().unwrap(); + Self { + keys, + data_type: builder.data_type, + nulls: builder.nulls, + values: Box::new(values), } } + /// Returns the length + #[inline] + pub fn len(&self) -> usize { + self.keys.len() + } + + /// Returns true if this array is empty + #[inline] + pub fn is_empty(&self) -> bool { + self.keys.is_empty() + } + /// Returns the null buffer if any #[inline] pub fn nulls(&self) -> Option<&NullBuffer> { @@ -160,10 +247,10 @@ impl DictionaryArrayData { &self.keys } - /// Returns the child data + /// Returns the values data #[inline] - pub fn child(&self) -> &ArrayData { - self.child.as_ref() + pub fn values(&self) -> &ArrayData { + self.values.as_ref() } /// Returns the data type of this array @@ -171,4 +258,33 @@ impl DictionaryArrayData { pub fn data_type(&self) -> &DataType { &self.data_type } + + /// Returns the underlying parts of this [`DictionaryArrayData`] + pub fn into_parts( + self, + ) -> (DataType, ScalarBuffer, Option, ArrayData) { + (self.data_type, self.keys, self.nulls, *self.values) + } + + /// Returns a zero-copy slice of this array + pub fn slice(&self, offset: usize, len: usize) -> Self { + Self { + keys: self.keys.slice(offset, len), + data_type: self.data_type.clone(), + nulls: self.nulls.as_ref().map(|x| x.slice(offset, len)), + values: self.values.clone(), + } + } + + /// Returns an [`ArrayDataLayout`] representation of this + pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { + ArrayDataLayout { + data_type: &self.data_type, + len: self.keys.len(), + offset: 0, + nulls: self.nulls.as_ref(), + buffers: Buffers::one(self.keys.inner()), + child_data: std::slice::from_ref(self.values.as_ref()), + } + } } diff --git a/arrow-data/src/data/list.rs b/arrow-data/src/data/list.rs index 59909289e933..bbc4e81cc5e8 100644 --- a/arrow-data/src/data/list.rs +++ b/arrow-data/src/data/list.rs @@ -16,9 +16,10 @@ // under the License. use crate::data::types::OffsetType; -use crate::ArrayData; -use arrow_buffer::buffer::{NullBuffer, ScalarBuffer}; -use arrow_buffer::{ArrowNativeType, Buffer}; +use crate::data::ArrayDataLayout; +use crate::{ArrayData, ArrayDataBuilder, Buffers}; +use arrow_buffer::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer}; +use arrow_buffer::ArrowNativeType; use arrow_schema::DataType; mod private { @@ -113,6 +114,17 @@ impl private::ListOffsetSealed for i64 { } } +/// Applies op to each variant of [`ListArrayData`] +#[macro_export] +macro_rules! list_op { + ($array:ident, $op:block) => { + match $array { + ArrayDataList::Small($array) => $op + ArrayDataList::Large($array) => $op + } + }; +} + /// An enumeration of the types of [`ListArrayData`] #[derive(Debug, Clone)] pub enum ArrayDataList { @@ -130,6 +142,36 @@ impl ArrayDataList { pub fn downcast(self) -> Option> { O::downcast(self) } + + /// Returns the values of this [`ArrayDataList`] + pub fn values(&self) -> &ArrayData { + let s = self; + list_op!(s, { s.values() }) + } + + /// Returns a zero-copy slice of this array + pub fn slice(&self, offset: usize, len: usize) -> Self { + let s = self; + list_op!(s, { s.slice(offset, len).into() }) + } + + /// Returns an [`ArrayDataLayout`] representation of this + pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { + let s = self; + list_op!(s, { s.layout() }) + } + + /// Creates a new [`ArrayDataList`] from raw buffers + /// + /// # Safety + /// + /// See [`ListArrayData::new_unchecked`] + pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder, offset: OffsetType) -> Self { + match offset { + OffsetType::Int32 => Self::Small(ListArrayData::from_raw(builder)), + OffsetType::Int64 => Self::Large(ListArrayData::from_raw(builder)), + } + } } impl From> for ArrayDataList { @@ -143,8 +185,8 @@ impl From> for ArrayDataList { pub struct ListArrayData { data_type: DataType, nulls: Option, - offsets: ScalarBuffer, - child: Box, + offsets: OffsetBuffer, + values: Box, } impl ListArrayData { @@ -157,18 +199,56 @@ impl ListArrayData { /// - `data_type` must be valid for this layout pub unsafe fn new_unchecked( data_type: DataType, - offsets: ScalarBuffer, + offsets: OffsetBuffer, nulls: Option, - child: ArrayData, + values: ArrayData, ) -> Self { Self { data_type, nulls, offsets, - child: Box::new(child), + values: Box::new(values), } } + /// Creates a new [`ListArrayData`] from an [`ArrayDataBuilder`] + /// + /// # Safety + /// + /// See [`Self::new_unchecked`] + pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder) -> Self { + let offsets = builder.buffers.into_iter().next().unwrap(); + let values = builder.child_data.into_iter().next().unwrap(); + + let offsets = match builder.len { + 0 => OffsetBuffer::new_empty(), + _ => OffsetBuffer::new_unchecked(ScalarBuffer::new( + offsets, + builder.offset, + builder.len + 1, + )), + }; + + Self { + offsets, + data_type: builder.data_type, + nulls: builder.nulls, + values: Box::new(values), + } + } + + /// Returns the length + #[inline] + pub fn len(&self) -> usize { + self.offsets.len().wrapping_sub(1) + } + + /// Returns true if this array is empty + #[inline] + pub fn is_empty(&self) -> bool { + self.offsets.len() <= 1 + } + /// Returns the null buffer if any #[inline] pub fn nulls(&self) -> Option<&NullBuffer> { @@ -177,14 +257,14 @@ impl ListArrayData { /// Returns the offsets #[inline] - pub fn offsets(&self) -> &[O] { + pub fn offsets(&self) -> &OffsetBuffer { &self.offsets } - /// Returns the child data + /// Returns the values of this [`ListArrayData`] #[inline] - pub fn child(&self) -> &ArrayData { - self.child.as_ref() + pub fn values(&self) -> &ArrayData { + self.values.as_ref() } /// Returns the data type of this array @@ -192,12 +272,43 @@ impl ListArrayData { pub fn data_type(&self) -> &DataType { &self.data_type } + + /// Returns the underlying parts of this [`ListArrayData`] + pub fn into_parts( + self, + ) -> (DataType, OffsetBuffer, Option, ArrayData) { + (self.data_type, self.offsets, self.nulls, *self.values) + } + + /// Returns a zero-copy slice of this array + pub fn slice(&self, offset: usize, len: usize) -> Self { + Self { + data_type: self.data_type.clone(), + nulls: self.nulls.as_ref().map(|x| x.slice(offset, len)), + offsets: self.offsets.slice(offset, len), + values: self.values.clone(), + } + } + + /// Returns an [`ArrayDataLayout`] representation of this + pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { + ArrayDataLayout { + data_type: &self.data_type, + len: self.len(), + offset: 0, + nulls: self.nulls.as_ref(), + buffers: Buffers::one(self.offsets.inner().inner()), + child_data: std::slice::from_ref(self.values.as_ref()), + } + } } /// ArrayData for [fixed-size list arrays](https://arrow.apache.org/docs/format/Columnar.html#fixed-size-list-layout) #[derive(Debug, Clone)] pub struct FixedSizeListArrayData { data_type: DataType, + len: usize, + element_size: usize, nulls: Option, child: Box, } @@ -208,19 +319,58 @@ impl FixedSizeListArrayData { /// # Safety /// /// - `data_type` must be valid for this layout - /// - `nulls.len() == values.len() / element_size` + /// - `nulls.len() == values.len() / element_size == len` pub unsafe fn new_unchecked( data_type: DataType, + len: usize, + element_size: usize, nulls: Option, child: ArrayData, ) -> Self { Self { data_type, + len, + element_size, nulls, child: Box::new(child), } } + /// Creates a new [`FixedSizeListArrayData`] from raw buffers + /// + /// # Safety + /// + /// See [`FixedSizeListArrayData::new_unchecked`] + pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder, size: usize) -> Self { + let child = + builder.child_data[0].slice(builder.offset * size, builder.len * size); + Self { + data_type: builder.data_type, + len: builder.len, + element_size: size, + nulls: builder.nulls, + child: Box::new(child), + } + } + + /// Returns the length + #[inline] + pub fn len(&self) -> usize { + self.len + } + + /// Returns true if this array is empty + #[inline] + pub fn is_empty(&self) -> bool { + self.len == 0 + } + + /// Returns the size of each element + #[inline] + pub fn element_size(&self) -> usize { + self.element_size + } + /// Returns the null buffer if any #[inline] pub fn nulls(&self) -> Option<&NullBuffer> { @@ -238,4 +388,36 @@ impl FixedSizeListArrayData { pub fn data_type(&self) -> &DataType { &self.data_type } + + /// Returns the underlying parts of this [`FixedSizeListArrayData`] + pub fn into_parts(self) -> (DataType, Option, ArrayData) { + (self.data_type, self.nulls, *self.child) + } + + /// Returns a zero-copy slice of this array + pub fn slice(&self, offset: usize, len: usize) -> Self { + let offset_element = offset.checked_mul(self.element_size).expect("overflow"); + let len_element = len.checked_mul(self.element_size).expect("overflow"); + let child = self.child.slice(offset_element, len_element); + + Self { + len, + data_type: self.data_type.clone(), + element_size: self.element_size, + nulls: self.nulls.as_ref().map(|x| x.slice(offset, len)), + child: Box::new(child), + } + } + + /// Returns an [`ArrayDataLayout`] representation of this + pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { + ArrayDataLayout { + data_type: &self.data_type, + len: self.len, + offset: 0, + nulls: self.nulls.as_ref(), + buffers: Buffers::default(), + child_data: std::slice::from_ref(self.child.as_ref()), + } + } } diff --git a/arrow-data/src/data/mod.rs b/arrow-data/src/data/mod.rs index 051deef07305..446c148ac4ea 100644 --- a/arrow-data/src/data/mod.rs +++ b/arrow-data/src/data/mod.rs @@ -33,6 +33,8 @@ use crate::equal; mod buffers; pub use buffers::*; +#[allow(unused)] // Private until ready (#1176) +mod boolean; #[allow(unused)] // Private until ready (#1176) mod bytes; #[allow(unused)] // Private until ready (#1176) @@ -40,6 +42,8 @@ mod dictionary; #[allow(unused)] // Private until ready (#1176) mod list; #[allow(unused)] // Private until ready (#1176) +mod null; +#[allow(unused)] // Private until ready (#1176) mod primitive; #[allow(unused)] // Private until ready (#1176) mod run; @@ -277,7 +281,6 @@ impl ArrayData { /// Note: This is a low level API and most users of the arrow /// crate should create arrays using the methods in the `array` /// module. - #[allow(clippy::let_and_return)] pub unsafe fn new_unchecked( data_type: DataType, len: usize, @@ -287,27 +290,17 @@ impl ArrayData { buffers: Vec, child_data: Vec, ) -> Self { - let nulls = null_bit_buffer - .map(|b| BooleanBuffer::new(b, offset, len)) - .map(|b| match null_count { - None => NullBuffer::new(b), - Some(null_count) => NullBuffer::new_unchecked(b, null_count), - }) - .filter(|b| b.null_count() > 0); - - let new_self = Self { + ArrayDataBuilder { data_type, len, + null_count, + null_bit_buffer, + nulls: None, offset, buffers, child_data, - nulls, - }; - - // Provide a force_validate mode - #[cfg(feature = "force_validate")] - new_self.validate_data().unwrap(); - new_self + } + .build_unchecked() } /// Create a new ArrayData, validating that the provided buffers form a valid @@ -358,7 +351,7 @@ impl ArrayData { // We don't need to validate children as we can assume that the // [`ArrayData`] in `child_data` have already been validated through // a call to `ArrayData::try_new` or created using unsafe - new_self.validate_data()?; + ArrayDataLayout::new(&new_self).validate_data()?; Ok(new_self) } @@ -448,14 +441,15 @@ impl ArrayData { /// If multiple [`ArrayData`]s refer to the same underlying /// [`Buffer`]s they will both report the same size. pub fn get_buffer_memory_size(&self) -> usize { + let s = ArrayDataLayout::new(self); let mut size = 0; - for buffer in &self.buffers { + for buffer in s.buffers { size += buffer.capacity(); } - if let Some(bitmap) = &self.nulls { + if let Some(bitmap) = s.nulls { size += bitmap.buffer().capacity() } - for child in &self.child_data { + for child in s.child_data { size += child.get_buffer_memory_size(); } size @@ -474,14 +468,15 @@ impl ArrayData { /// first `20` elements, then [`Self::get_slice_memory_size`] on the /// sliced [`ArrayData`] would return `20 * 8 = 160`. pub fn get_slice_memory_size(&self) -> Result { + let s = ArrayDataLayout::new(self); let mut result: usize = 0; - let layout = layout(&self.data_type); + let layout = layout(s.data_type); for spec in layout.buffers.iter() { match spec { BufferSpec::FixedWidth { byte_width } => { let buffer_size = - self.len.checked_mul(*byte_width).ok_or_else(|| { + s.len.checked_mul(*byte_width).ok_or_else(|| { ArrowError::ComputeError( "Integer overflow computing buffer size".to_string(), ) @@ -490,26 +485,26 @@ impl ArrayData { } BufferSpec::VariableWidth => { let buffer_len: usize; - match self.data_type { + match s.data_type { DataType::Utf8 | DataType::Binary => { - let offsets = self.typed_offsets::()?; - buffer_len = (offsets[self.len] - offsets[0] ) as usize; + let offsets = s.typed_offsets::()?; + buffer_len = (offsets[s.len] - offsets[0]) as usize; } DataType::LargeUtf8 | DataType::LargeBinary => { - let offsets = self.typed_offsets::()?; - buffer_len = (offsets[self.len] - offsets[0]) as usize; + let offsets = s.typed_offsets::()?; + buffer_len = (offsets[s.len] - offsets[0]) as usize; } _ => { return Err(ArrowError::NotYetImplemented(format!( - "Invalid data type for VariableWidth buffer. Expected Utf8, LargeUtf8, Binary or LargeBinary. Got {}", - self.data_type + "Invalid data type for VariableWidth buffer. Expected Utf8, LargeUtf8, Binary or LargeBinary. Got {}", + s.data_type ))) } }; result += buffer_len; } BufferSpec::BitMap => { - let buffer_size = bit_util::ceil(self.len, 8); + let buffer_size = bit_util::ceil(s.len, 8); result += buffer_size; } BufferSpec::AlwaysNull => { @@ -518,11 +513,11 @@ impl ArrayData { } } - if self.nulls().is_some() { - result += bit_util::ceil(self.len, 8); + if s.nulls.is_some() { + result += bit_util::ceil(s.len, 8); } - for child in &self.child_data { + for child in s.child_data { result += child.get_slice_memory_size()?; } Ok(result) @@ -537,17 +532,18 @@ impl ArrayData { /// [`Self::get_buffer_memory_size`] + /// `size_of_val(child)` for all children pub fn get_array_memory_size(&self) -> usize { + let s = ArrayDataLayout::new(self); let mut size = mem::size_of_val(self); // Calculate rest of the fields top down which contain actual data - for buffer in &self.buffers { + for buffer in s.buffers { size += mem::size_of::(); size += buffer.capacity(); } - if let Some(nulls) = &self.nulls { + if let Some(nulls) = s.nulls { size += nulls.buffer().capacity(); } - for child in &self.child_data { + for child in s.child_data { size += child.get_array_memory_size(); } @@ -598,14 +594,8 @@ impl ArrayData { /// This function panics if: /// * the buffer is not byte-aligned with type T, or /// * the datatype is `Boolean` (it corresponds to a bit-packed buffer where the offset is not applicable) - #[inline] pub fn buffer(&self, buffer: usize) -> &[T] { - let values = unsafe { self.buffers[buffer].as_slice().align_to::() }; - if !values.0.is_empty() || !values.2.is_empty() { - panic!("The buffer is not byte-aligned with its interpretation") - }; - assert_ne!(self.data_type, DataType::Boolean); - &values.1[self.offset..] + self.buffers()[buffer].typed_data() } /// Returns a new [`ArrayData`] valid for `data_type` containing `len` null values @@ -740,11 +730,101 @@ impl ArrayData { /// See [ArrayData::validate_data] to validate fully the offset content /// and the validity of utf8 data pub fn validate(&self) -> Result<(), ArrowError> { + ArrayDataLayout::new(self).validate() + } + + /// Validate that the data contained within this [`ArrayData`] is valid + /// + /// 1. Null count is correct + /// 2. All offsets are valid + /// 3. All String data is valid UTF-8 + /// 4. All dictionary offsets are valid + /// + /// Internally this calls: + /// + /// * [`Self::validate`] + /// * [`Self::validate_nulls`] + /// * [`Self::validate_values`] + /// + /// Note: this does not recurse into children, for a recursive variant + /// see [`Self::validate_full`] + pub fn validate_data(&self) -> Result<(), ArrowError> { + ArrayDataLayout::new(self).validate_data() + } + + /// Performs a full recursive validation of this [`ArrayData`] and all its children + /// + /// This is equivalent to calling [`Self::validate_data`] on this [`ArrayData`] + /// and all its children recursively + pub fn validate_full(&self) -> Result<(), ArrowError> { + ArrayDataLayout::new(self).validate_full() + } + + /// Validates the values stored within this [`ArrayData`] are valid + /// without recursing into child [`ArrayData`] + /// + /// Does not (yet) check + /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85) + /// Validates the the null count is correct and that any + /// nullability requirements of its children are correct + pub fn validate_nulls(&self) -> Result<(), ArrowError> { + ArrayDataLayout::new(self).validate_nulls() + } + + /// Validates the values stored within this [`ArrayData`] are valid + /// without recursing into child [`ArrayData`] + /// + /// Does not (yet) check + /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85) + pub fn validate_values(&self) -> Result<(), ArrowError> { + ArrayDataLayout::new(self).validate_values() + } + + /// Returns true if this `ArrayData` is equal to `other`, using pointer comparisons + /// to determine buffer equality. This is cheaper than `PartialEq::eq` but may + /// return false when the arrays are logically equal + pub fn ptr_eq(&self, other: &Self) -> bool { + ArrayDataLayout::new(self).ptr_eq(&ArrayDataLayout::new(other)) + } + + /// Converts this [`ArrayData`] into an [`ArrayDataBuilder`] + pub fn into_builder(self) -> ArrayDataBuilder { + self.into() + } +} + +/// A flat representation of [`ArrayData`] +/// +/// This is temporary measure to bridge the gap between the strongly-typed +/// ArrayData enumeration and the older-style struct representation (#1799) +#[derive(Copy, Clone)] +pub(crate) struct ArrayDataLayout<'a> { + data_type: &'a DataType, + len: usize, + offset: usize, + nulls: Option<&'a NullBuffer>, + buffers: Buffers<'a>, + child_data: &'a [ArrayData], +} + +impl<'a> ArrayDataLayout<'a> { + fn new(data: &'a ArrayData) -> Self { + Self { + data_type: &data.data_type, + len: data.len, + offset: data.offset, + nulls: data.nulls.as_ref(), + buffers: Buffers::from_slice(&data.buffers), + child_data: &data.child_data, + } + } + + fn validate(&self) -> Result<(), ArrowError> { // Need at least this mich space in each buffer let len_plus_offset = self.len + self.offset; // Check that the data layout conforms to the spec - let layout = layout(&self.data_type); + let layout = layout(self.data_type); if !layout.can_contain_null_mask && self.nulls.is_some() { return Err(ArrowError::InvalidArgumentError(format!( @@ -799,7 +879,7 @@ impl ArrayData { } // check null bit buffer size - if let Some(nulls) = self.nulls() { + if let Some(nulls) = self.nulls { if nulls.null_count() > self.len { return Err(ArrowError::InvalidArgumentError(format!( "null_count {} for an array exceeds length of {} elements", @@ -1013,7 +1093,7 @@ impl ArrayData { run_ends_data.len, values_data.len ))); } - if run_ends_data.null_count() > 0 { + if run_ends_data.nulls.is_some() { return Err(ArrowError::InvalidArgumentError( "Found null values in run_ends array. The run_ends array should not have null values.".to_string(), )); @@ -1061,17 +1141,17 @@ impl ArrayData { fn get_single_valid_child_data( &self, expected_type: &DataType, - ) -> Result<&ArrayData, ArrowError> { + ) -> Result, ArrowError> { self.validate_num_child_data(1)?; self.get_valid_child_data(0, expected_type) } /// Returns `Err` if self.child_data does not have exactly `expected_len` elements fn validate_num_child_data(&self, expected_len: usize) -> Result<(), ArrowError> { - if self.child_data().len() != expected_len { + if self.child_data.len() != expected_len { Err(ArrowError::InvalidArgumentError(format!( "Value data for {} should contain {} child data array(s), had {}", - self.data_type(), + self.data_type, expected_len, self.child_data.len() ))) @@ -1086,7 +1166,7 @@ impl ArrayData { &self, i: usize, expected_type: &DataType, - ) -> Result<&ArrayData, ArrowError> { + ) -> Result { let values_data = self.child_data .get(i) .ok_or_else(|| { @@ -1095,8 +1175,9 @@ impl ArrayData { self.data_type, i+1, self.child_data.len() )) })?; + let values_data = ArrayDataLayout::new(values_data); - if expected_type != &values_data.data_type { + if expected_type != values_data.data_type { return Err(ArrowError::InvalidArgumentError(format!( "Child type mismatch for {}. Expected {} but child data had {}", self.data_type, expected_type, values_data.data_type @@ -1107,22 +1188,7 @@ impl ArrayData { Ok(values_data) } - /// Validate that the data contained within this [`ArrayData`] is valid - /// - /// 1. Null count is correct - /// 2. All offsets are valid - /// 3. All String data is valid UTF-8 - /// 4. All dictionary offsets are valid - /// - /// Internally this calls: - /// - /// * [`Self::validate`] - /// * [`Self::validate_nulls`] - /// * [`Self::validate_values`] - /// - /// Note: this does not recurse into children, for a recursive variant - /// see [`Self::validate_full`] - pub fn validate_data(&self) -> Result<(), ArrowError> { + fn validate_data(&self) -> Result<(), ArrowError> { self.validate()?; self.validate_nulls()?; @@ -1130,11 +1196,7 @@ impl ArrayData { Ok(()) } - /// Performs a full recursive validation of this [`ArrayData`] and all its children - /// - /// This is equivalent to calling [`Self::validate_data`] on this [`ArrayData`] - /// and all its children recursively - pub fn validate_full(&self) -> Result<(), ArrowError> { + fn validate_full(&self) -> Result<(), ArrowError> { self.validate_data()?; // validate all children recursively self.child_data @@ -1151,14 +1213,7 @@ impl ArrayData { Ok(()) } - /// Validates the values stored within this [`ArrayData`] are valid - /// without recursing into child [`ArrayData`] - /// - /// Does not (yet) check - /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85) - /// Validates the the null count is correct and that any - /// nullability requirements of its children are correct - pub fn validate_nulls(&self) -> Result<(), ArrowError> { + fn validate_nulls(&self) -> Result<(), ArrowError> { if let Some(nulls) = &self.nulls { let actual = nulls.len() - nulls.inner().count_set_bits(); if actual != nulls.null_count() { @@ -1176,11 +1231,12 @@ impl ArrayData { match &self.data_type { DataType::List(f) | DataType::LargeList(f) | DataType::Map(f, _) => { if !f.is_nullable() { - self.validate_non_nullable(None, 0, &self.child_data[0])? + let child = ArrayDataLayout::new(&self.child_data[0]); + self.validate_non_nullable(None, 0, child)? } } DataType::FixedSizeList(field, len) => { - let child = &self.child_data[0]; + let child = ArrayDataLayout::new(&self.child_data[0]); if !field.is_nullable() { match &self.nulls { Some(nulls) => { @@ -1209,7 +1265,8 @@ impl ArrayData { } } DataType::Struct(fields) => { - for (field, child) in fields.iter().zip(&self.child_data) { + for (field, child) in fields.iter().zip(self.child_data) { + let child = ArrayDataLayout::new(child); if !field.is_nullable() { match &self.nulls { Some(n) => self.validate_non_nullable( @@ -1233,24 +1290,24 @@ impl ArrayData { &self, mask: Option<&Buffer>, offset: usize, - data: &ArrayData, + child: ArrayDataLayout<'_>, ) -> Result<(), ArrowError> { let mask = match mask { Some(mask) => mask.as_ref(), - None => return match data.null_count() { + None => return match child.nulls.map(|x| x.null_count()).unwrap_or_default() { 0 => Ok(()), _ => Err(ArrowError::InvalidArgumentError(format!( "non-nullable child of type {} contains nulls not present in parent {}", - data.data_type(), + child.data_type, self.data_type ))), }, }; - match data.nulls() { + match child.nulls { Some(nulls) => { - let mask = BitChunks::new(mask, offset, data.len); - let nulls = BitChunks::new(nulls.validity(), nulls.offset(), data.len); + let mask = BitChunks::new(mask, offset, child.len); + let nulls = BitChunks::new(nulls.validity(), nulls.offset(), child.len); mask .iter() .zip(nulls.iter()) @@ -1261,7 +1318,7 @@ impl ArrayData { if (m & !c) != 0 { return Err(ArrowError::InvalidArgumentError(format!( "non-nullable child of type {} contains nulls not present in parent", - data.data_type() + child.data_type ))) } Ok(()) @@ -1276,7 +1333,7 @@ impl ArrayData { /// /// Does not (yet) check /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85) - pub fn validate_values(&self) -> Result<(), ArrowError> { + fn validate_values(&self) -> Result<(), ArrowError> { match &self.data_type { DataType::Utf8 => self.validate_utf8::(), DataType::LargeUtf8 => self.validate_utf8::(), @@ -1286,11 +1343,11 @@ impl ArrayData { } DataType::List(_) | DataType::Map(_, _) => { let child = &self.child_data[0]; - self.validate_offsets_full::(child.len) + self.validate_offsets_full::(child.len()) } DataType::LargeList(_) => { let child = &self.child_data[0]; - self.validate_offsets_full::(child.len) + self.validate_offsets_full::(child.len()) } DataType::Union(_, _, _) => { // Validate Union Array as part of implementing new Union semantics @@ -1301,7 +1358,7 @@ impl ArrayData { Ok(()) } DataType::Dictionary(key_type, _value_type) => { - let dictionary_length: i64 = self.child_data[0].len.try_into().unwrap(); + let dictionary_length: i64 = self.child_data[0].len().try_into().unwrap(); let max_value = dictionary_length - 1; match key_type.as_ref() { DataType::UInt8 => self.check_bounds::(max_value), @@ -1316,7 +1373,7 @@ impl ArrayData { } } DataType::RunEndEncoded(run_ends, _values) => { - let run_ends_data = self.child_data()[0].clone(); + let run_ends_data = ArrayDataLayout::new(&self.child_data[0]); match run_ends.data_type() { DataType::Int16 => run_ends_data.check_run_ends::(), DataType::Int32 => run_ends_data.check_run_ends::(), @@ -1460,7 +1517,7 @@ impl ArrayData { indexes.iter().enumerate().try_for_each(|(i, &dict_index)| { // Do not check the value is null (value can be arbitrary) - if self.is_null(i) { + if self.nulls.map(|x| x.is_null(i)).unwrap_or_default() { return Ok(()); } let dict_index: i64 = dict_index.try_into().map_err(|_| { @@ -1483,7 +1540,7 @@ impl ArrayData { where T: ArrowNativeType + TryInto + num::Num + std::fmt::Display, { - let values = self.typed_buffer::(0, self.len())?; + let values = self.typed_buffer::(0, self.len)?; let mut prev_value: i64 = 0_i64; values.iter().enumerate().try_for_each(|(ix, &inp_value)| { let value: i64 = inp_value.try_into().map_err(|_| { @@ -1548,11 +1605,6 @@ impl ArrayData { .zip(other.child_data.iter()) .all(|(a, b)| a.ptr_eq(b)) } - - /// Converts this [`ArrayData`] into an [`ArrayDataBuilder`] - pub fn into_builder(self) -> ArrayDataBuilder { - self.into() - } } /// Return the expected [`DataTypeLayout`] Arrays of this data @@ -1807,15 +1859,14 @@ impl ArrayDataBuilder { /// The same caveats as [`ArrayData::new_unchecked`] /// apply. #[allow(clippy::let_and_return)] - pub unsafe fn build_unchecked(self) -> ArrayData { - let nulls = self.nulls.or_else(|| { - let buffer = self.null_bit_buffer?; + pub unsafe fn build_unchecked(mut self) -> ArrayData { + if let Some(buffer) = self.null_bit_buffer.take() { let buffer = BooleanBuffer::new(buffer, self.offset, self.len); - Some(match self.null_count { + self.nulls = Some(match self.null_count { Some(n) => NullBuffer::new_unchecked(buffer, n), None => NullBuffer::new(buffer), }) - }); + } let data = ArrayData { data_type: self.data_type, @@ -1823,7 +1874,7 @@ impl ArrayDataBuilder { offset: self.offset, buffers: self.buffers, child_data: self.child_data, - nulls, + nulls: self.nulls, }; // Provide a force_validate mode @@ -1837,7 +1888,7 @@ impl ArrayDataBuilder { pub fn build(self) -> Result { let data = unsafe { self.build_unchecked() }; #[cfg(not(feature = "force_validate"))] - data.validate_data()?; + ArrayDataLayout::new(&data).validate_data()?; Ok(data) } } diff --git a/arrow-data/src/data/null.rs b/arrow-data/src/data/null.rs new file mode 100644 index 000000000000..9fa387474f65 --- /dev/null +++ b/arrow-data/src/data/null.rs @@ -0,0 +1,104 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::data::types::PhysicalType; +use crate::data::ArrayDataLayout; +use crate::{ArrayDataBuilder, Buffers}; +use arrow_schema::DataType; + +/// ArrayData for [null arrays](https://arrow.apache.org/docs/format/Columnar.html#null-layout) +#[derive(Debug, Clone)] +pub struct NullArrayData { + data_type: DataType, + len: usize, +} + +impl NullArrayData { + /// Create a new [`NullArrayData`] + /// + /// # Panic + /// + /// - `data_type` is not compatible with `T` + pub fn new(data_type: DataType, len: usize) -> Self { + assert_eq!( + PhysicalType::from(&data_type), + PhysicalType::Null, + "Illegal physical type for NullArrayData of datatype {data_type:?}", + ); + Self { data_type, len } + } + + /// Create a new [`NullArrayData`] + /// + /// # Safety + /// + /// - `PhysicalType::from(&data_type) == PhysicalType::Null` + pub unsafe fn new_unchecked(data_type: DataType, len: usize) -> Self { + Self { data_type, len } + } + + /// Creates a new [`NullArrayData`] from raw buffers + /// + /// # Safety + /// + /// See [`NullArrayData::new_unchecked`] + pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder) -> Self { + Self { + data_type: builder.data_type, + len: builder.len, + } + } + + /// Returns the data type of this array + #[inline] + pub fn data_type(&self) -> &DataType { + &self.data_type + } + + /// Returns the length of this array + #[inline] + pub fn len(&self) -> usize { + self.len + } + + /// Returns the [`DataType`] and length of this [`NullArrayData`] + pub fn into_parts(self) -> (DataType, usize) { + (self.data_type, self.len) + } + + /// Returns a zero-copy slice of this array + pub fn slice(&self, offset: usize, len: usize) -> Self { + let new_len = offset.saturating_add(len); + assert!(new_len <= self.len,); + Self { + data_type: self.data_type.clone(), + len, + } + } + + /// Returns an [`ArrayDataLayout`] representation of this + pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { + ArrayDataLayout { + data_type: &self.data_type, + len: self.len, + offset: 0, + nulls: None, + buffers: Buffers::default(), + child_data: &[], + } + } +} diff --git a/arrow-data/src/data/primitive.rs b/arrow-data/src/data/primitive.rs index 058b3e822056..df456c7751f1 100644 --- a/arrow-data/src/data/primitive.rs +++ b/arrow-data/src/data/primitive.rs @@ -16,6 +16,8 @@ // under the License. use crate::data::types::{PhysicalType, PrimitiveType}; +use crate::data::ArrayDataLayout; +use crate::{ArrayDataBuilder, Buffers}; use arrow_buffer::buffer::{NullBuffer, ScalarBuffer}; use arrow_buffer::{i256, ArrowNativeType}; use arrow_schema::DataType; @@ -46,6 +48,28 @@ pub trait Primitive: private::PrimitiveSealed + ArrowNativeType { const TYPE: PrimitiveType; } +/// Applies op to each variant of [`ArrayDataPrimitive`] +#[macro_export] +macro_rules! primitive_op { + ($array:ident, $op:block) => { + match $array { + ArrayDataPrimitive::Int8($array) => $op + ArrayDataPrimitive::Int16($array) => $op + ArrayDataPrimitive::Int32($array) => $op + ArrayDataPrimitive::Int64($array) => $op + ArrayDataPrimitive::Int128($array) => $op + ArrayDataPrimitive::Int256($array) => $op + ArrayDataPrimitive::UInt8($array) => $op + ArrayDataPrimitive::UInt16($array) => $op + ArrayDataPrimitive::UInt32($array) => $op + ArrayDataPrimitive::UInt64($array) => $op + ArrayDataPrimitive::Float16($array) => $op + ArrayDataPrimitive::Float32($array) => $op + ArrayDataPrimitive::Float64($array) => $op + } + }; +} + macro_rules! primitive { ($t:ty,$v:ident) => { impl Primitive for $t { @@ -90,6 +114,7 @@ primitive!(f32, Float32); primitive!(f64, Float64); /// An enumeration of the types of [`PrimitiveArrayData`] +#[derive(Debug, Clone)] pub enum ArrayDataPrimitive { Int8(PrimitiveArrayData), Int16(PrimitiveArrayData), @@ -116,6 +141,45 @@ impl ArrayDataPrimitive { pub fn downcast(self) -> Option> { P::downcast(self) } + + /// Returns a zero-copy slice of this array + pub fn slice(&self, offset: usize, len: usize) -> Self { + let s = self; + primitive_op!(s, { s.slice(offset, len).into() }) + } + + /// Returns an [`ArrayDataLayout`] representation of this + pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { + let s = self; + primitive_op!(s, { s.layout() }) + } + + /// Creates a new [`ArrayDataPrimitive`] from raw buffers + /// + /// # Safety + /// + /// See [`PrimitiveArrayData::new_unchecked`] + pub(crate) unsafe fn from_raw( + builder: ArrayDataBuilder, + primitive: PrimitiveType, + ) -> Self { + use PrimitiveType::*; + match primitive { + Int8 => Self::Int8(PrimitiveArrayData::from_raw(builder)), + Int16 => Self::Int16(PrimitiveArrayData::from_raw(builder)), + Int32 => Self::Int32(PrimitiveArrayData::from_raw(builder)), + Int64 => Self::Int64(PrimitiveArrayData::from_raw(builder)), + Int128 => Self::Int128(PrimitiveArrayData::from_raw(builder)), + Int256 => Self::Int256(PrimitiveArrayData::from_raw(builder)), + UInt8 => Self::UInt8(PrimitiveArrayData::from_raw(builder)), + UInt16 => Self::UInt16(PrimitiveArrayData::from_raw(builder)), + UInt32 => Self::UInt32(PrimitiveArrayData::from_raw(builder)), + UInt64 => Self::UInt64(PrimitiveArrayData::from_raw(builder)), + Float16 => Self::Float16(PrimitiveArrayData::from_raw(builder)), + Float32 => Self::Float32(PrimitiveArrayData::from_raw(builder)), + Float64 => Self::Float64(PrimitiveArrayData::from_raw(builder)), + } + } } impl From> for ArrayDataPrimitive { @@ -128,8 +192,8 @@ impl From> for ArrayDataPrimitive { #[derive(Debug, Clone)] pub struct PrimitiveArrayData { data_type: DataType, - nulls: Option, values: ScalarBuffer, + nulls: Option, } impl PrimitiveArrayData { @@ -145,13 +209,10 @@ impl PrimitiveArrayData { values: ScalarBuffer, nulls: Option, ) -> Self { - let physical = PhysicalType::from(&data_type); - assert!( - matches!(physical, PhysicalType::Primitive(p) if p == T::TYPE), - "Illegal physical type for PrimitiveArrayData of datatype {:?}, expected {:?} got {:?}", - data_type, - T::TYPE, - physical + assert_eq!( + PhysicalType::from(&data_type), + PhysicalType::Primitive(T::TYPE), + "Illegal physical type for PrimitiveArrayData of datatype {data_type:?}", ); if let Some(n) = nulls.as_ref() { @@ -165,6 +226,21 @@ impl PrimitiveArrayData { } } + /// Creates a new [`PrimitiveArrayData`] from an [`ArrayDataBuilder`] + /// + /// # Safety + /// + /// See [`Self::new_unchecked`] + pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder) -> Self { + let values = builder.buffers.into_iter().next().unwrap(); + let values = ScalarBuffer::new(values, builder.offset, builder.len); + Self { + values, + data_type: builder.data_type, + nulls: builder.nulls, + } + } + /// Returns the null buffer if any #[inline] pub fn nulls(&self) -> Option<&NullBuffer> { @@ -173,7 +249,7 @@ impl PrimitiveArrayData { /// Returns the primitive values #[inline] - pub fn values(&self) -> &[T] { + pub fn values(&self) -> &ScalarBuffer { &self.values } @@ -182,4 +258,30 @@ impl PrimitiveArrayData { pub fn data_type(&self) -> &DataType { &self.data_type } + + /// Returns the underlying parts of this [`PrimitiveArrayData`] + pub fn into_parts(self) -> (DataType, ScalarBuffer, Option) { + (self.data_type, self.values, self.nulls) + } + + /// Returns a zero-copy slice of this array + pub fn slice(&self, offset: usize, len: usize) -> Self { + Self { + data_type: self.data_type.clone(), + values: self.values.slice(offset, len), + nulls: self.nulls.as_ref().map(|x| x.slice(offset, len)), + } + } + + /// Returns an [`ArrayDataLayout`] representation of this + pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { + ArrayDataLayout { + data_type: &self.data_type, + len: self.values.len(), + offset: 0, + nulls: self.nulls.as_ref(), + buffers: Buffers::one(self.values.inner()), + child_data: &[], + } + } } diff --git a/arrow-data/src/data/run.rs b/arrow-data/src/data/run.rs index cd993de1bf25..134d05553a50 100644 --- a/arrow-data/src/data/run.rs +++ b/arrow-data/src/data/run.rs @@ -15,17 +15,20 @@ // specific language governing permissions and limitations // under the License. +use crate::data::primitive::{Primitive, PrimitiveArrayData}; use crate::data::types::RunEndType; -use crate::ArrayData; -use arrow_buffer::buffer::ScalarBuffer; +use crate::data::ArrayDataLayout; +use crate::{ArrayData, ArrayDataBuilder, Buffers}; +use arrow_buffer::buffer::{RunEndBuffer, ScalarBuffer}; use arrow_buffer::ArrowNativeType; use arrow_schema::DataType; -use std::marker::PhantomData; mod private { use super::*; pub trait RunEndSealed { + const ENDS_TYPE: DataType; + /// Downcast [`ArrayDataRun`] to `[RunArrayData`] fn downcast_ref(data: &ArrayDataRun) -> Option<&RunArrayData> where @@ -43,7 +46,7 @@ mod private { } } -pub trait RunEnd: private::RunEndSealed + ArrowNativeType { +pub trait RunEnd: private::RunEndSealed + ArrowNativeType + Primitive { const TYPE: RunEndType; } @@ -53,6 +56,8 @@ macro_rules! run_end { const TYPE: RunEndType = RunEndType::$v; } impl private::RunEndSealed for $t { + const ENDS_TYPE: DataType = DataType::$v; + fn downcast_ref(data: &ArrayDataRun) -> Option<&RunArrayData> { match data { ArrayDataRun::$v(v) => Some(v), @@ -78,7 +83,20 @@ run_end!(i16, Int16); run_end!(i32, Int32); run_end!(i64, Int64); +/// Applies op to each variant of [`ArrayDataRun`] +#[macro_export] +macro_rules! run_op { + ($array:ident, $op:block) => { + match $array { + ArrayDataRun::Int16($array) => $op + ArrayDataRun::Int32($array) => $op + ArrayDataRun::Int64($array) => $op + } + }; +} + /// An enumeration of the types of [`RunArrayData`] +#[derive(Debug, Clone)] pub enum ArrayDataRun { Int16(RunArrayData), Int32(RunArrayData), @@ -88,26 +106,63 @@ pub enum ArrayDataRun { impl ArrayDataRun { /// Downcast this [`ArrayDataRun`] to the corresponding [`RunArrayData`] pub fn downcast_ref(&self) -> Option<&RunArrayData> { - E::downcast_ref(self) + ::downcast_ref(self) } /// Downcast this [`ArrayDataRun`] to the corresponding [`RunArrayData`] pub fn downcast(self) -> Option> { - E::downcast(self) + ::downcast(self) + } + + /// Returns the values of this [`ArrayDataRun`] + #[inline] + pub fn values(&self) -> &ArrayData { + let s = self; + run_op!(s, { s.values() }) + } + + /// Returns a zero-copy slice of this array + pub fn slice(&self, offset: usize, len: usize) -> Self { + let s = self; + run_op!(s, { s.slice(offset, len).into() }) + } + + /// Returns an [`ArrayDataLayout`] representation of this + pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { + let s = self; + run_op!(s, { s.layout() }) + } + + /// Creates a new [`ArrayDataRun`] from raw buffers + /// + /// # Safety + /// + /// See [`RunArrayData::new_unchecked`] + pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder, run: RunEndType) -> Self { + use RunEndType::*; + match run { + Int16 => Self::Int16(RunArrayData::from_raw(builder)), + Int32 => Self::Int32(RunArrayData::from_raw(builder)), + Int64 => Self::Int64(RunArrayData::from_raw(builder)), + } } } impl From> for ArrayDataRun { fn from(value: RunArrayData) -> Self { - E::upcast(value) + ::upcast(value) } } /// ArrayData for [run-end encoded arrays](https://arrow.apache.org/docs/format/Columnar.html#run-end-encoded-layout) +#[derive(Debug, Clone)] pub struct RunArrayData { data_type: DataType, - run_ends: ScalarBuffer, - child: Box, + run_ends: RunEndBuffer, + /// The children of this RunArrayData + /// 1: the run ends + /// 2: the values + children: Box<[ArrayData; 2]>, } impl RunArrayData { @@ -116,22 +171,67 @@ impl RunArrayData { /// # Safety /// /// - `data_type` must be valid for this layout - /// - `run_ends` must contain monotonically increasing, positive values `<= child.len()` + /// - `run_ends` must contain monotonically increasing, positive values `<= len` + /// - `run_ends.len() == child.len()` pub unsafe fn new_unchecked( data_type: DataType, - run_ends: ScalarBuffer, - child: ArrayData, + run_ends: RunEndBuffer, + values: ArrayData, ) -> Self { + let inner = run_ends.inner(); + let child = ArrayDataBuilder::new(E::ENDS_TYPE) + .len(inner.len()) + .buffers(vec![inner.inner().clone()]) + .build_unchecked(); + Self { data_type, run_ends, - child: Box::new(child), + children: Box::new([child, values]), + } + } + + /// Creates a new [`RunArrayData`] from raw buffers + /// + /// # Safety + /// + /// See [`RunArrayData::new_unchecked`] + pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder) -> Self { + let mut iter = builder.child_data.into_iter(); + let child1 = iter.next().unwrap(); + let child2 = iter.next().unwrap(); + + let p = ScalarBuffer::new(child1.buffers[0].clone(), child1.offset, child1.len); + let run_ends = RunEndBuffer::new_unchecked(p, builder.offset, builder.len); + + Self { + run_ends, + data_type: builder.data_type, + children: Box::new([child1, child2]), } } + /// Returns the length + #[inline] + pub fn len(&self) -> usize { + self.run_ends.len() + } + + /// Returns the offset + #[inline] + pub fn offset(&self) -> usize { + self.run_ends.offset() + } + + /// Returns true if this array is empty + #[inline] + pub fn is_empty(&self) -> bool { + self.run_ends.is_empty() + } + /// Returns the run ends #[inline] - pub fn run_ends(&self) -> &[E] { + pub fn run_ends(&self) -> &RunEndBuffer { &self.run_ends } @@ -143,7 +243,34 @@ impl RunArrayData { /// Returns the child data #[inline] - pub fn child(&self) -> &ArrayData { - self.child.as_ref() + pub fn values(&self) -> &ArrayData { + &self.children[1] + } + + /// Returns the underlying parts of this [`RunArrayData`] + pub fn into_parts(self) -> (DataType, RunEndBuffer, ArrayData) { + let child = self.children.into_iter().nth(1).unwrap(); + (self.data_type, self.run_ends, child) + } + + /// Returns a zero-copy slice of this array + pub fn slice(&self, offset: usize, len: usize) -> Self { + Self { + data_type: self.data_type.clone(), + run_ends: self.run_ends.slice(offset, len), + children: self.children.clone(), + } + } + + /// Returns an [`ArrayDataLayout`] representation of this + pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { + ArrayDataLayout { + data_type: &self.data_type, + len: self.run_ends.len(), + offset: self.run_ends.offset(), + nulls: None, + buffers: Buffers::default(), + child_data: self.children.as_ref(), + } } } diff --git a/arrow-data/src/data/struct.rs b/arrow-data/src/data/struct.rs index d9999261902e..8845dbb08f67 100644 --- a/arrow-data/src/data/struct.rs +++ b/arrow-data/src/data/struct.rs @@ -15,7 +15,8 @@ // specific language governing permissions and limitations // under the License. -use crate::ArrayData; +use crate::data::ArrayDataLayout; +use crate::{ArrayData, ArrayDataBuilder, Buffers}; use arrow_buffer::buffer::NullBuffer; use arrow_schema::DataType; @@ -49,6 +50,26 @@ impl StructArrayData { } } + /// Creates a new [`StructArrayData`] from raw buffers + /// + /// # Safety + /// + /// See [`StructArrayData::new_unchecked`] + pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder) -> Self { + let children = builder + .child_data + .into_iter() + .map(|x| x.slice(builder.offset, builder.len)) + .collect(); + + Self { + data_type: builder.data_type, + len: builder.len, + nulls: builder.nulls, + children, + } + } + /// Returns the length of this [`StructArrayData`] #[inline] pub fn len(&self) -> usize { @@ -78,4 +99,31 @@ impl StructArrayData { pub fn data_type(&self) -> &DataType { &self.data_type } + + /// Returns the underlying parts of this [`StructArrayData`] + pub fn into_parts(self) -> (DataType, Option, Vec) { + (self.data_type, self.nulls, self.children) + } + + /// Returns a zero-copy slice of this array + pub fn slice(&self, offset: usize, len: usize) -> Self { + Self { + len, + data_type: self.data_type.clone(), + nulls: self.nulls.as_ref().map(|x| x.slice(offset, len)), + children: self.children.iter().map(|c| c.slice(offset, len)).collect(), + } + } + + /// Returns an [`ArrayDataLayout`] representation of this + pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { + ArrayDataLayout { + data_type: &self.data_type, + len: self.len, + offset: 0, + nulls: self.nulls.as_ref(), + buffers: Buffers::default(), + child_data: &self.children, + } + } } diff --git a/arrow-data/src/data/types.rs b/arrow-data/src/data/types.rs index 3414e481ca66..bb65b42124f3 100644 --- a/arrow-data/src/data/types.rs +++ b/arrow-data/src/data/types.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use arrow_schema::{DataType, IntervalUnit}; +use arrow_schema::{DataType, IntervalUnit, UnionMode}; /// An enumeration of the primitive types implementing [`ArrowNativeType`](arrow_buffer::ArrowNativeType) #[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Hash)] @@ -76,12 +76,12 @@ pub enum PhysicalType { Null, Boolean, Primitive(PrimitiveType), - FixedSizeBinary, + FixedSizeBinary(usize), Bytes(OffsetType, BytesType), - FixedSizeList, + FixedSizeList(usize), List(OffsetType), Struct, - Union, + Union(UnionMode), Dictionary(DictionaryKeyType), Run(RunEndType), } @@ -119,16 +119,16 @@ impl From<&DataType> for PhysicalType { DataType::Interval(IntervalUnit::MonthDayNano) => { Self::Primitive(PrimitiveType::Int128) } - DataType::FixedSizeBinary(_) => Self::FixedSizeBinary, + DataType::FixedSizeBinary(size) => Self::FixedSizeBinary(*size as usize), DataType::Binary => Self::Bytes(OffsetType::Int32, BytesType::Binary), DataType::LargeBinary => Self::Bytes(OffsetType::Int64, BytesType::Binary), DataType::Utf8 => Self::Bytes(OffsetType::Int32, BytesType::Utf8), DataType::LargeUtf8 => Self::Bytes(OffsetType::Int64, BytesType::Utf8), DataType::List(_) => Self::List(OffsetType::Int32), - DataType::FixedSizeList(_, _) => Self::FixedSizeList, + DataType::FixedSizeList(_, size) => Self::FixedSizeList(*size as usize), DataType::LargeList(_) => Self::List(OffsetType::Int64), DataType::Struct(_) => Self::Struct, - DataType::Union(_, _, _) => Self::Union, + DataType::Union(_, _, mode) => Self::Union(*mode), DataType::Dictionary(k, _) => match k.as_ref() { DataType::Int8 => Self::Dictionary(DictionaryKeyType::Int8), DataType::Int16 => Self::Dictionary(DictionaryKeyType::Int16), diff --git a/arrow-data/src/data/union.rs b/arrow-data/src/data/union.rs index 7861bd154e71..edf865a22ca1 100644 --- a/arrow-data/src/data/union.rs +++ b/arrow-data/src/data/union.rs @@ -15,9 +15,10 @@ // specific language governing permissions and limitations // under the License. -use crate::ArrayData; +use crate::data::ArrayDataLayout; +use crate::{ArrayData, ArrayDataBuilder, Buffers}; use arrow_buffer::buffer::ScalarBuffer; -use arrow_schema::DataType; +use arrow_schema::{DataType, UnionMode}; /// ArrayData for [union arrays](https://arrow.apache.org/docs/format/Columnar.html#union-layout) #[derive(Debug, Clone)] @@ -51,16 +52,62 @@ impl UnionArrayData { } } + /// Creates a new [`StructArrayData`] from raw buffers + /// + /// # Safety + /// + /// See [`UnionArrayData::new_unchecked`] + pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder, mode: UnionMode) -> Self { + match mode { + UnionMode::Sparse => { + let type_ids = builder.buffers.into_iter().next().unwrap(); + let type_ids = ScalarBuffer::new(type_ids, builder.offset, builder.len); + let children = builder + .child_data + .into_iter() + .map(|x| x.slice(builder.offset, builder.len)) + .collect(); + + Self { + type_ids, + children, + data_type: builder.data_type, + offsets: None, + } + } + UnionMode::Dense => { + let mut iter = builder.buffers.into_iter(); + let type_ids = iter.next().unwrap(); + let offsets = iter.next().unwrap(); + let type_ids = ScalarBuffer::new(type_ids, builder.offset, builder.len); + let offsets = ScalarBuffer::new(offsets, builder.offset, builder.len); + + Self { + type_ids, + data_type: builder.data_type, + offsets: Some(offsets), + children: builder.child_data, + } + } + } + } + + /// Returns the length of this array + #[inline] + pub fn len(&self) -> usize { + self.type_ids.len() + } + /// Returns the type ids for this array #[inline] - pub fn type_ids(&self) -> &[i8] { + pub fn type_ids(&self) -> &ScalarBuffer { &self.type_ids } /// Returns the offsets for this array if this is a dense union #[inline] - pub fn offsets(&self) -> Option<&[i32]> { - self.offsets.as_deref() + pub fn offsets(&self) -> Option<&ScalarBuffer> { + self.offsets.as_ref() } /// Returns the children of this array @@ -74,4 +121,50 @@ impl UnionArrayData { pub fn data_type(&self) -> &DataType { &self.data_type } + + /// Returns the underlying parts of this [`UnionArrayData`] + pub fn into_parts( + self, + ) -> ( + DataType, + ScalarBuffer, + Option>, + Vec, + ) { + (self.data_type, self.type_ids, self.offsets, self.children) + } + + /// Returns a zero-copy slice of this array + pub fn slice(&self, offset: usize, len: usize) -> Self { + let (offsets, children) = match &self.offsets { + Some(offsets) => (Some(offsets.slice(offset, len)), self.children.clone()), + None => ( + None, + self.children.iter().map(|c| c.slice(offset, len)).collect(), + ), + }; + Self { + data_type: self.data_type.clone(), + type_ids: self.type_ids.slice(offset, len), + offsets, + children, + } + } + + /// Returns an [`ArrayDataLayout`] representation of this + pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { + let buffers = match &self.offsets { + Some(offsets) => Buffers::two(self.type_ids.inner(), offsets.inner()), + None => Buffers::one(self.type_ids.inner()), + }; + + ArrayDataLayout { + data_type: &self.data_type, + len: self.type_ids.len(), + offset: 0, + nulls: None, + buffers, + child_data: &self.children, + } + } }