From 56a72ea4cc2bf20a3d96a8eab73ce0deb1939bc0 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 9 Oct 2021 10:22:23 -0400 Subject: [PATCH 1/3] Replace `ArrayData::new()` with `ArrayData::try_new()` and `unsafe ArrayData::new_unchecked` --- arrow/benches/array_from_vec.rs | 8 +- arrow/examples/builders.rs | 9 +- arrow/src/array/array.rs | 167 ++++++++-------- arrow/src/array/array_binary.rs | 99 ++++++---- arrow/src/array/array_boolean.rs | 33 ++-- arrow/src/array/array_dictionary.rs | 29 +-- arrow/src/array/array_list.rs | 104 ++++++---- arrow/src/array/array_map.rs | 30 ++- arrow/src/array/array_primitive.rs | 88 +++++---- arrow/src/array/array_string.rs | 19 +- arrow/src/array/array_struct.rs | 38 ++-- arrow/src/array/array_union.rs | 8 +- arrow/src/array/builder.rs | 63 +++--- arrow/src/array/data.rs | 126 ++++++++++-- arrow/src/array/equal/mod.rs | 33 ++-- arrow/src/array/equal/utils.rs | 9 +- arrow/src/array/null.rs | 3 +- arrow/src/array/transform/mod.rs | 17 +- arrow/src/compute/kernels/arithmetic.rs | 60 +++--- arrow/src/compute/kernels/arity.rs | 26 +-- arrow/src/compute/kernels/boolean.rs | 106 +++++++---- arrow/src/compute/kernels/cast.rs | 136 +++++++------ arrow/src/compute/kernels/comparison.rs | 243 +++++++++++++----------- arrow/src/compute/kernels/filter.rs | 18 +- arrow/src/compute/kernels/length.rs | 20 +- arrow/src/compute/kernels/limit.rs | 15 +- arrow/src/compute/kernels/sort.rs | 40 ++-- arrow/src/compute/kernels/substring.rs | 26 +-- arrow/src/compute/kernels/take.rs | 107 ++++++----- arrow/src/compute/util.rs | 27 +-- arrow/src/ffi.rs | 27 +-- arrow/src/ipc/reader.rs | 31 +-- arrow/src/json/reader.rs | 96 ++++++---- arrow/src/json/writer.rs | 12 +- arrow/src/record_batch.rs | 6 +- arrow/src/util/data_gen.rs | 20 +- arrow/src/util/integration_util.rs | 3 +- integration-testing/src/lib.rs | 14 +- parquet/src/arrow/array_reader.rs | 35 ++-- parquet/src/arrow/arrow_array_reader.rs | 18 +- parquet/src/arrow/arrow_writer.rs | 36 ++-- 
parquet/src/arrow/levels.rs | 6 +- 42 files changed, 1218 insertions(+), 793 deletions(-) diff --git a/arrow/benches/array_from_vec.rs b/arrow/benches/array_from_vec.rs index 01ddf3503236..a589ee9ce8a5 100644 --- a/arrow/benches/array_from_vec.rs +++ b/arrow/benches/array_from_vec.rs @@ -31,9 +31,11 @@ fn array_from_vec(n: usize) { for i in 0..n { v.push((i & 0xffff) as u8); } - let arr_data = ArrayDataBuilder::new(DataType::Int32) - .add_buffer(Buffer::from(v)) - .build(); + let arr_data = unsafe { + ArrayDataBuilder::new(DataType::Int32) + .add_buffer(Buffer::from(v)) + .build_unchecked() + }; criterion::black_box(Int32Array::from(arr_data)); } diff --git a/arrow/examples/builders.rs b/arrow/examples/builders.rs index 61cce0ed97a5..0dc1d76f34f4 100644 --- a/arrow/examples/builders.rs +++ b/arrow/examples/builders.rs @@ -82,7 +82,8 @@ fn main() { .add_buffer(Buffer::from(offsets.to_byte_slice())) .add_buffer(Buffer::from(&values[..])) .null_bit_buffer(Buffer::from([0b00000101])) - .build(); + .build() + .unwrap(); let binary_array = StringArray::from(array_data); println!("{:?}", binary_array); @@ -92,7 +93,8 @@ fn main() { let value_data = ArrayData::builder(DataType::Int32) .len(8) .add_buffer(Buffer::from(&[0, 1, 2, 3, 4, 5, 6, 7].to_byte_slice())) - .build(); + .build() + .unwrap(); // Construct a buffer for value offsets, for the nested array: // [[0, 1, 2], [3, 4, 5], [6, 7]] @@ -105,7 +107,8 @@ fn main() { .len(3) .add_buffer(value_offsets) .add_child_data(value_data) - .build(); + .build() + .unwrap(); let list_array = ListArray::from(list_data); println!("{:?}", list_array); diff --git a/arrow/src/array/array.rs b/arrow/src/array/array.rs index be19fea37cc8..fcf4647666e8 100644 --- a/arrow/src/array/array.rs +++ b/arrow/src/array/array.rs @@ -377,15 +377,17 @@ pub fn new_null_array(data_type: &DataType, length: usize) -> ArrayRef { DataType::Null => Arc::new(NullArray::new(length)), DataType::Boolean => { let null_buf: Buffer = 
MutableBuffer::new_null(length).into(); - make_array(ArrayData::new( - data_type.clone(), - length, - Some(length), - Some(null_buf.clone()), - 0, - vec![null_buf], - vec![], - )) + make_array(unsafe { + ArrayData::new_unchecked( + data_type.clone(), + length, + Some(length), + Some(null_buf.clone()), + 0, + vec![null_buf], + vec![], + ) + }) } DataType::Int8 => new_null_sized_array::(data_type, length), DataType::UInt8 => new_null_sized_array::(data_type, length), @@ -414,15 +416,17 @@ pub fn new_null_array(data_type: &DataType, length: usize) -> ArrayRef { new_null_sized_array::(data_type, length) } }, - DataType::FixedSizeBinary(value_len) => make_array(ArrayData::new( - data_type.clone(), - length, - Some(length), - Some(MutableBuffer::new_null(length).into()), - 0, - vec![Buffer::from(vec![0u8; *value_len as usize * length])], - vec![], - )), + DataType::FixedSizeBinary(value_len) => make_array(unsafe { + ArrayData::new_unchecked( + data_type.clone(), + length, + Some(length), + Some(MutableBuffer::new_null(length).into()), + 0, + vec![Buffer::from(vec![0u8; *value_len as usize * length])], + vec![], + ) + }), DataType::Binary | DataType::Utf8 => { new_null_binary_array::(data_type, length) } @@ -435,19 +439,21 @@ pub fn new_null_array(data_type: &DataType, length: usize) -> ArrayRef { DataType::LargeList(field) => { new_null_list_array::(data_type, field.data_type(), length) } - DataType::FixedSizeList(field, value_len) => make_array(ArrayData::new( - data_type.clone(), - length, - Some(length), - Some(MutableBuffer::new_null(length).into()), - 0, - vec![], - vec![ - new_null_array(field.data_type(), *value_len as usize * length) - .data() - .clone(), - ], - )), + DataType::FixedSizeList(field, value_len) => make_array(unsafe { + ArrayData::new_unchecked( + data_type.clone(), + length, + Some(length), + Some(MutableBuffer::new_null(length).into()), + 0, + vec![], + vec![ + new_null_array(field.data_type(), *value_len as usize * length) + .data() + .clone(), + 
], + ) + }), DataType::Struct(fields) => { let fields: Vec<_> = fields .iter() @@ -467,15 +473,17 @@ pub fn new_null_array(data_type: &DataType, length: usize) -> ArrayRef { let keys = new_null_array(key, length); let keys = keys.data(); - make_array(ArrayData::new( - data_type.clone(), - length, - Some(length), - keys.null_buffer().cloned(), - 0, - keys.buffers().into(), - vec![new_empty_array(value.as_ref()).data().clone()], - )) + make_array(unsafe { + ArrayData::new_unchecked( + data_type.clone(), + length, + Some(length), + keys.null_buffer().cloned(), + 0, + keys.buffers().into(), + vec![new_empty_array(value.as_ref()).data().clone()], + ) + }) } DataType::Decimal(_, _) => { unimplemented!("Creating null Decimal array not yet supported") @@ -489,17 +497,19 @@ fn new_null_list_array( child_data_type: &DataType, length: usize, ) -> ArrayRef { - make_array(ArrayData::new( - data_type.clone(), - length, - Some(length), - Some(MutableBuffer::new_null(length).into()), - 0, - vec![Buffer::from( - vec![OffsetSize::zero(); length + 1].to_byte_slice(), - )], - vec![ArrayData::new_empty(child_data_type)], - )) + make_array(unsafe { + ArrayData::new_unchecked( + data_type.clone(), + length, + Some(length), + Some(MutableBuffer::new_null(length).into()), + 0, + vec![Buffer::from( + vec![OffsetSize::zero(); length + 1].to_byte_slice(), + )], + vec![ArrayData::new_empty(child_data_type)], + ) + }) } #[inline] @@ -507,18 +517,20 @@ fn new_null_binary_array( data_type: &DataType, length: usize, ) -> ArrayRef { - make_array(ArrayData::new( - data_type.clone(), - length, - Some(length), - Some(MutableBuffer::new_null(length).into()), - 0, - vec![ - Buffer::from(vec![OffsetSize::zero(); length + 1].to_byte_slice()), - MutableBuffer::new(0).into(), - ], - vec![], - )) + make_array(unsafe { + ArrayData::new_unchecked( + data_type.clone(), + length, + Some(length), + Some(MutableBuffer::new_null(length).into()), + 0, + vec![ + Buffer::from(vec![OffsetSize::zero(); length + 
1].to_byte_slice()), + MutableBuffer::new(0).into(), + ], + vec![], + ) + }) } #[inline] @@ -526,15 +538,17 @@ fn new_null_sized_array( data_type: &DataType, length: usize, ) -> ArrayRef { - make_array(ArrayData::new( - data_type.clone(), - length, - Some(length), - Some(MutableBuffer::new_null(length).into()), - 0, - vec![Buffer::from(vec![0u8; length * T::get_byte_width()])], - vec![], - )) + make_array(unsafe { + ArrayData::new_unchecked( + data_type.clone(), + length, + Some(length), + Some(MutableBuffer::new_null(length).into()), + 0, + vec![Buffer::from(vec![0u8; length * T::get_byte_width()])], + vec![], + ) + }) } /// Creates a new array from two FFI pointers. Used to import arrays from the C Data Interface @@ -755,7 +769,8 @@ mod tests { ArrayData::builder(arr.data_type().clone()) .add_buffer(MutableBuffer::new(0).into()) .null_bit_buffer(MutableBuffer::new_null(0).into()) - .build(), + .build() + .unwrap(), ); // expected size is the size of the PrimitiveArray struct, @@ -791,8 +806,10 @@ mod tests { .child_data(vec![ArrayData::builder(DataType::Int64) .len(values.len()) .buffers(values.data_ref().buffers().to_vec()) - .build()]) - .build(); + .build() + .unwrap()]) + .build() + .unwrap(); let empty_data = ArrayData::new_empty(&DataType::Dictionary( Box::new(DataType::Int16), diff --git a/arrow/src/array/array_binary.rs b/arrow/src/array/array_binary.rs index b477fc6aa812..89a3efd2caf2 100644 --- a/arrow/src/array/array_binary.rs +++ b/arrow/src/array/array_binary.rs @@ -137,8 +137,8 @@ impl GenericBinaryArray { let array_data = ArrayData::builder(OffsetSize::DATA_TYPE) .len(v.len()) .add_buffer(Buffer::from_slice_ref(&offsets)) - .add_buffer(Buffer::from_slice_ref(&values)) - .build(); + .add_buffer(Buffer::from_slice_ref(&values)); + let array_data = unsafe { array_data.build_unchecked() }; GenericBinaryArray::::from(array_data) } @@ -168,7 +168,7 @@ impl GenericBinaryArray { builder = builder.null_bit_buffer(bitmap.bits.clone()) } - let data = 
builder.build(); + let data = unsafe { builder.build_unchecked() }; Self::from(data) } } @@ -263,8 +263,8 @@ where .len(data_len) .add_buffer(Buffer::from_slice_ref(&offsets)) .add_buffer(Buffer::from_slice_ref(&values)) - .null_bit_buffer(null_buf.into()) - .build(); + .null_bit_buffer(null_buf.into()); + let array_data = unsafe { array_data.build_unchecked() }; Self::from(array_data) } } @@ -520,15 +520,17 @@ impl FixedSizeBinaryArray { } let size = size.unwrap_or(0); - let array_data = ArrayData::new( - DataType::FixedSizeBinary(size as i32), - len, - None, - Some(null_buf.into()), - 0, - vec![buffer.into()], - vec![], - ); + let array_data = unsafe { + ArrayData::new_unchecked( + DataType::FixedSizeBinary(size as i32), + len, + None, + Some(null_buf.into()), + 0, + vec![buffer.into()], + vec![], + ) + }; Ok(FixedSizeBinaryArray::from(array_data)) } @@ -586,8 +588,8 @@ impl FixedSizeBinaryArray { let size = size.unwrap_or(0); let array_data = ArrayData::builder(DataType::FixedSizeBinary(size as i32)) .len(len) - .add_buffer(buffer.into()) - .build(); + .add_buffer(buffer.into()); + let array_data = unsafe { array_data.build_unchecked() }; Ok(FixedSizeBinaryArray::from(array_data)) } @@ -639,7 +641,7 @@ impl From for FixedSizeBinaryArray { builder = builder.null_bit_buffer(bitmap.bits.clone()) } - let data = builder.build(); + let data = unsafe { builder.build_unchecked() }; Self::from(data) } } @@ -787,8 +789,8 @@ impl DecimalArray { builder = builder.null_bit_buffer(bitmap.bits.clone()) } - let data = builder.build(); - Self::from(data) + let array_data = unsafe { builder.build_unchecked() }; + Self::from(array_data) } pub fn precision(&self) -> usize { self.precision @@ -865,7 +867,8 @@ mod tests { .len(3) .add_buffer(Buffer::from_slice_ref(&offsets)) .add_buffer(Buffer::from_slice_ref(&values)) - .build(); + .build() + .unwrap(); let binary_array = BinaryArray::from(array_data); assert_eq!(3, binary_array.len()); assert_eq!(0, binary_array.null_count()); @@ 
-895,7 +898,8 @@ mod tests { .offset(1) .add_buffer(Buffer::from_slice_ref(&offsets)) .add_buffer(Buffer::from_slice_ref(&values)) - .build(); + .build() + .unwrap(); let binary_array = BinaryArray::from(array_data); assert_eq!( [b'p', b'a', b'r', b'q', b'u', b'e', b't'], @@ -919,7 +923,8 @@ mod tests { .len(3) .add_buffer(Buffer::from_slice_ref(&offsets)) .add_buffer(Buffer::from_slice_ref(&values)) - .build(); + .build() + .unwrap(); let binary_array = LargeBinaryArray::from(array_data); assert_eq!(3, binary_array.len()); assert_eq!(0, binary_array.null_count()); @@ -949,7 +954,8 @@ mod tests { .offset(1) .add_buffer(Buffer::from_slice_ref(&offsets)) .add_buffer(Buffer::from_slice_ref(&values)) - .build(); + .build() + .unwrap(); let binary_array = LargeBinaryArray::from(array_data); assert_eq!( [b'p', b'a', b'r', b'q', b'u', b'e', b't'], @@ -972,7 +978,8 @@ mod tests { let values_data = ArrayData::builder(DataType::UInt8) .len(12) .add_buffer(Buffer::from(&values[..])) - .build(); + .build() + .unwrap(); let offsets: [i32; 4] = [0, 5, 5, 12]; // Array data: ["hello", "", "parquet"] @@ -980,7 +987,8 @@ mod tests { .len(3) .add_buffer(Buffer::from_slice_ref(&offsets)) .add_buffer(Buffer::from_slice_ref(&values)) - .build(); + .build() + .unwrap(); let binary_array1 = BinaryArray::from(array_data1); let data_type = @@ -989,7 +997,8 @@ mod tests { .len(3) .add_buffer(Buffer::from_slice_ref(&offsets)) .add_child_data(values_data) - .build(); + .build() + .unwrap(); let list_array = ListArray::from(array_data2); let binary_array2 = BinaryArray::from(list_array); @@ -1016,7 +1025,8 @@ mod tests { let values_data = ArrayData::builder(DataType::UInt8) .len(12) .add_buffer(Buffer::from(&values[..])) - .build(); + .build() + .unwrap(); let offsets: [i64; 4] = [0, 5, 5, 12]; // Array data: ["hello", "", "parquet"] @@ -1024,7 +1034,8 @@ mod tests { .len(3) .add_buffer(Buffer::from_slice_ref(&offsets)) .add_buffer(Buffer::from_slice_ref(&values)) - .build(); + .build() + 
.unwrap(); let binary_array1 = LargeBinaryArray::from(array_data1); let data_type = @@ -1033,7 +1044,8 @@ mod tests { .len(3) .add_buffer(Buffer::from_slice_ref(&offsets)) .add_child_data(values_data) - .build(); + .build() + .unwrap(); let list_array = LargeListArray::from(array_data2); let binary_array2 = LargeBinaryArray::from(list_array); @@ -1113,7 +1125,8 @@ mod tests { let values_data = ArrayData::builder(DataType::UInt32) .len(12) .add_buffer(Buffer::from_slice_ref(&values)) - .build(); + .build() + .unwrap(); let offsets: [i32; 4] = [0, 5, 5, 12]; let data_type = @@ -1122,7 +1135,8 @@ mod tests { .len(3) .add_buffer(Buffer::from_slice_ref(&offsets)) .add_child_data(values_data) - .build(); + .build() + .unwrap(); let list_array = ListArray::from(array_data); BinaryArray::from(list_array); } @@ -1134,7 +1148,8 @@ mod tests { let array_data = ArrayData::builder(DataType::FixedSizeBinary(5)) .len(3) .add_buffer(Buffer::from(&values[..])) - .build(); + .build() + .unwrap(); let fixed_size_binary_array = FixedSizeBinaryArray::from(array_data); assert_eq!(3, fixed_size_binary_array.len()); assert_eq!(0, fixed_size_binary_array.null_count()); @@ -1162,7 +1177,8 @@ mod tests { .len(2) .offset(1) .add_buffer(Buffer::from(&values[..])) - .build(); + .build() + .unwrap(); let fixed_size_binary_array = FixedSizeBinaryArray::from(array_data); assert_eq!( [b't', b'h', b'e', b'r', b'e'], @@ -1188,8 +1204,9 @@ mod tests { let values_data = ArrayData::builder(DataType::UInt32) .len(12) .add_buffer(Buffer::from_slice_ref(&values)) - .add_child_data(ArrayData::builder(DataType::Boolean).build()) - .build(); + .add_child_data(ArrayData::builder(DataType::Boolean).build().unwrap()) + .build() + .unwrap(); let array_data = ArrayData::builder(DataType::FixedSizeList( Box::new(Field::new("item", DataType::Binary, false)), @@ -1197,7 +1214,8 @@ mod tests { )) .len(3) .add_child_data(values_data) - .build(); + .build() + .unwrap(); let list_array = 
FixedSizeListArray::from(array_data); FixedSizeBinaryArray::from(list_array); } @@ -1212,7 +1230,8 @@ mod tests { .len(3) .add_buffer(Buffer::from_slice_ref(&offsets)) .add_buffer(Buffer::from_slice_ref(&values)) - .build(); + .build() + .unwrap(); let binary_array = BinaryArray::from(array_data); binary_array.value(4); } @@ -1224,7 +1243,8 @@ mod tests { let array_data = ArrayData::builder(DataType::FixedSizeBinary(5)) .len(3) .add_buffer(Buffer::from(&values[..])) - .build(); + .build() + .unwrap(); let arr = FixedSizeBinaryArray::from(array_data); assert_eq!( "FixedSizeBinaryArray<5>\n[\n [104, 101, 108, 108, 111],\n [116, 104, 101, 114, 101],\n [97, 114, 114, 111, 119],\n]", @@ -1243,7 +1263,8 @@ mod tests { let array_data = ArrayData::builder(DataType::Decimal(23, 6)) .len(2) .add_buffer(Buffer::from(&values[..])) - .build(); + .build() + .unwrap(); let decimal_array = DecimalArray::from(array_data); assert_eq!(8_887_000_000, decimal_array.value(0)); assert_eq!(-8_887_000_000, decimal_array.value(1)); diff --git a/arrow/src/array/array_boolean.rs b/arrow/src/array/array_boolean.rs index 9274e65c8d69..07f3da6c4147 100644 --- a/arrow/src/array/array_boolean.rs +++ b/arrow/src/array/array_boolean.rs @@ -147,8 +147,9 @@ impl From> for BooleanArray { } let array_data = ArrayData::builder(DataType::Boolean) .len(data.len()) - .add_buffer(mut_buf.into()) - .build(); + .add_buffer(mut_buf.into()); + + let array_data = unsafe { array_data.build_unchecked() }; BooleanArray::from(array_data) } } @@ -212,15 +213,17 @@ impl>> FromIterator for BooleanArray { } }); - let data = ArrayData::new( - DataType::Boolean, - data_len, - None, - Some(null_buf.into()), - 0, - vec![val_buf.into()], - vec![], - ); + let data = unsafe { + ArrayData::new_unchecked( + DataType::Boolean, + data_len, + None, + Some(null_buf.into()), + 0, + vec![val_buf.into()], + vec![], + ) + }; BooleanArray::from(data) } } @@ -313,7 +316,8 @@ mod tests { .len(5) .offset(2) .add_buffer(buf) - .build(); + 
.build() + .unwrap(); let arr = BooleanArray::from(data); assert_eq!(&buf2, arr.values()); assert_eq!(5, arr.len()); @@ -328,7 +332,10 @@ mod tests { #[should_panic(expected = "BooleanArray data should contain a single buffer only \ (values buffer)")] fn test_boolean_array_invalid_buffer_len() { - let data = ArrayData::builder(DataType::Boolean).len(5).build(); + let data = ArrayData::builder(DataType::Boolean) + .len(5) + .build() + .unwrap(); BooleanArray::from(data); } } diff --git a/arrow/src/array/array_dictionary.rs b/arrow/src/array/array_dictionary.rs index de9873ccee5c..c684c253aa7b 100644 --- a/arrow/src/array/array_dictionary.rs +++ b/arrow/src/array/array_dictionary.rs @@ -130,15 +130,17 @@ impl From for DictionaryArray { panic!("DictionaryArray's data type must match.") }; // create a zero-copy of the keys' data - let keys = PrimitiveArray::::from(ArrayData::new( - T::DATA_TYPE, - data.len(), - Some(data.null_count()), - data.null_buffer().cloned(), - data.offset(), - data.buffers().to_vec(), - vec![], - )); + let keys = PrimitiveArray::::from(unsafe { + ArrayData::new_unchecked( + T::DATA_TYPE, + data.len(), + Some(data.null_count()), + data.null_buffer().cloned(), + data.offset(), + data.buffers().to_vec(), + vec![], + ) + }); let values = make_array(data.child_data()[0].clone()); Self { data, @@ -272,7 +274,8 @@ mod tests { .add_buffer(Buffer::from( &[10_i8, 11, 12, 13, 14, 15, 16, 17].to_byte_slice(), )) - .build(); + .build() + .unwrap(); // Construct a buffer for value offsets, for the nested array: let keys = Buffer::from(&[2_i16, 3, 4].to_byte_slice()); @@ -286,7 +289,8 @@ mod tests { .len(3) .add_buffer(keys.clone()) .add_child_data(value_data.clone()) - .build(); + .build() + .unwrap(); let dict_array = Int16DictionaryArray::from(dict_data); let values = dict_array.values(); @@ -305,7 +309,8 @@ mod tests { .offset(1) .add_buffer(keys) .add_child_data(value_data.clone()) - .build(); + .build() + .unwrap(); let dict_array = 
Int16DictionaryArray::from(dict_data); let values = dict_array.values(); diff --git a/arrow/src/array/array_list.rs b/arrow/src/array/array_list.rs index 0489271189ba..fbba8fcf412d 100644 --- a/arrow/src/array/array_list.rs +++ b/arrow/src/array/array_list.rs @@ -182,13 +182,14 @@ impl GenericListArray { } else { DataType::List(field) }; - let data = ArrayData::builder(data_type) + let array_data = ArrayData::builder(data_type) .len(null_buf.len()) .add_buffer(offsets.into()) .add_child_data(values.data().clone()) - .null_bit_buffer(null_buf.into()) - .build(); - Self::from(data) + .null_bit_buffer(null_buf.into()); + let array_data = unsafe { array_data.build_unchecked() }; + + Self::from(array_data) } } @@ -466,7 +467,8 @@ mod tests { let value_data = ArrayData::builder(DataType::Int32) .len(8) .add_buffer(Buffer::from(&[0, 1, 2, 3, 4, 5, 6, 7].to_byte_slice())) - .build(); + .build() + .unwrap(); // Construct a buffer for value offsets, for the nested array: // [[0, 1, 2], [3, 4, 5], [6, 7]] @@ -479,7 +481,8 @@ mod tests { .len(3) .add_buffer(value_offsets) .add_child_data(value_data) - .build(); + .build() + .unwrap(); ListArray::from(list_data) } @@ -502,7 +505,8 @@ mod tests { let value_data = ArrayData::builder(DataType::Int32) .len(8) .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7])) - .build(); + .build() + .unwrap(); // Construct a buffer for value offsets, for the nested array: // [[0, 1, 2], [3, 4, 5], [6, 7]] @@ -515,7 +519,8 @@ mod tests { .len(3) .add_buffer(value_offsets.clone()) .add_child_data(value_data.clone()) - .build(); + .build() + .unwrap(); let list_array = ListArray::from(list_data); let values = list_array.values(); @@ -553,7 +558,8 @@ mod tests { .offset(1) .add_buffer(value_offsets) .add_child_data(value_data.clone()) - .build(); + .build() + .unwrap(); let list_array = ListArray::from(list_data); let values = list_array.values(); @@ -588,7 +594,8 @@ mod tests { let value_data = ArrayData::builder(DataType::Int32) .len(8) 
.add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7])) - .build(); + .build() + .unwrap(); // Construct a buffer for value offsets, for the nested array: // [[0, 1, 2], [3, 4, 5], [6, 7]] @@ -601,7 +608,8 @@ mod tests { .len(3) .add_buffer(value_offsets.clone()) .add_child_data(value_data.clone()) - .build(); + .build() + .unwrap(); let list_array = LargeListArray::from(list_data); let values = list_array.values(); @@ -639,7 +647,8 @@ mod tests { .offset(1) .add_buffer(value_offsets) .add_child_data(value_data.clone()) - .build(); + .build() + .unwrap(); let list_array = LargeListArray::from(list_data); let values = list_array.values(); @@ -674,7 +683,8 @@ mod tests { let value_data = ArrayData::builder(DataType::Int32) .len(9) .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7, 8])) - .build(); + .build() + .unwrap(); // Construct a list array from the above two let list_data_type = DataType::FixedSizeList( @@ -684,7 +694,8 @@ mod tests { let list_data = ArrayData::builder(list_data_type.clone()) .len(3) .add_child_data(value_data.clone()) - .build(); + .build() + .unwrap(); let list_array = FixedSizeListArray::from(list_data); let values = list_array.values(); @@ -713,7 +724,8 @@ mod tests { .len(3) .offset(1) .add_child_data(value_data.clone()) - .build(); + .build() + .unwrap(); let list_array = FixedSizeListArray::from(list_data); let values = list_array.values(); @@ -743,7 +755,8 @@ mod tests { let value_data = ArrayData::builder(DataType::Int32) .len(8) .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7])) - .build(); + .build() + .unwrap(); // Construct a list array from the above two let list_data_type = DataType::FixedSizeList( @@ -753,7 +766,8 @@ mod tests { let list_data = ArrayData::builder(list_data_type) .len(3) .add_child_data(value_data) - .build(); + .build() + .unwrap(); FixedSizeListArray::from(list_data); } @@ -763,7 +777,8 @@ mod tests { let value_data = ArrayData::builder(DataType::Int32) .len(10) 
.add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) - .build(); + .build() + .unwrap(); // Construct a buffer for value offsets, for the nested array: // [[0, 1], null, null, [2, 3], [4, 5], null, [6, 7, 8], null, [9]] @@ -784,7 +799,8 @@ mod tests { .add_buffer(value_offsets) .add_child_data(value_data.clone()) .null_bit_buffer(Buffer::from(null_bits)) - .build(); + .build() + .unwrap(); let list_array = ListArray::from(list_data); let values = list_array.values(); @@ -825,7 +841,8 @@ mod tests { let value_data = ArrayData::builder(DataType::Int32) .len(10) .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) - .build(); + .build() + .unwrap(); // Construct a buffer for value offsets, for the nested array: // [[0, 1], null, null, [2, 3], [4, 5], null, [6, 7, 8], null, [9]] @@ -846,7 +863,8 @@ mod tests { .add_buffer(value_offsets) .add_child_data(value_data.clone()) .null_bit_buffer(Buffer::from(null_bits)) - .build(); + .build() + .unwrap(); let list_array = LargeListArray::from(list_data); let values = list_array.values(); @@ -890,7 +908,8 @@ mod tests { let value_data = ArrayData::builder(DataType::Int32) .len(10) .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) - .build(); + .build() + .unwrap(); // Construct a buffer for value offsets, for the nested array: // [[0, 1], null, null, [2, 3], [4, 5], null, [6, 7, 8], null, [9]] @@ -911,7 +930,8 @@ mod tests { .add_buffer(value_offsets) .add_child_data(value_data) .null_bit_buffer(Buffer::from(null_bits)) - .build(); + .build() + .unwrap(); let list_array = LargeListArray::from(list_data); assert_eq!(9, list_array.len()); @@ -924,7 +944,8 @@ mod tests { let value_data = ArrayData::builder(DataType::Int32) .len(10) .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) - .build(); + .build() + .unwrap(); // Set null buts for the nested array: // [[0, 1], null, null, [6, 7], [8, 9]] @@ -943,7 +964,8 @@ mod tests { .len(5) 
.add_child_data(value_data.clone()) .null_bit_buffer(Buffer::from(null_bits)) - .build(); + .build() + .unwrap(); let list_array = FixedSizeListArray::from(list_data); let values = list_array.values(); @@ -984,7 +1006,8 @@ mod tests { let value_data = ArrayData::builder(DataType::Int32) .len(10) .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) - .build(); + .build() + .unwrap(); // Set null buts for the nested array: // [[0, 1], null, null, [6, 7], [8, 9]] @@ -1003,7 +1026,8 @@ mod tests { .len(5) .add_child_data(value_data) .null_bit_buffer(Buffer::from(null_bits)) - .build(); + .build() + .unwrap(); let list_array = FixedSizeListArray::from(list_data); list_array.value(10); @@ -1017,13 +1041,15 @@ mod tests { let value_data = ArrayData::builder(DataType::Int32) .len(8) .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7])) - .build(); + .build() + .unwrap(); let list_data_type = DataType::List(Box::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_child_data(value_data) - .build(); + .build() + .unwrap(); ListArray::from(list_data); } @@ -1038,7 +1064,8 @@ mod tests { let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) - .build(); + .build() + .unwrap(); ListArray::from(list_data); } @@ -1048,7 +1075,8 @@ mod tests { let value_data = ArrayData::builder(DataType::Int32) .len(8) .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7])) - .build(); + .build() + .unwrap(); let value_offsets = Buffer::from_slice_ref(&[2, 2, 5, 7]); @@ -1058,7 +1086,8 @@ mod tests { .len(3) .add_buffer(value_offsets) .add_child_data(value_data) - .build(); + .build() + .unwrap(); ListArray::from(list_data); } @@ -1068,7 +1097,10 @@ mod tests { let ptr = alloc::allocate_aligned::(8); let buf = unsafe { Buffer::from_raw_parts(ptr, 8, 8) }; let buf2 = buf.slice(1); - let array_data = ArrayData::builder(DataType::Int32).add_buffer(buf2).build(); + let 
array_data = ArrayData::builder(DataType::Int32) + .add_buffer(buf2) + .build() + .unwrap(); Int32Array::from(array_data); } @@ -1082,14 +1114,16 @@ mod tests { let values: [i32; 8] = [0; 8]; let value_data = ArrayData::builder(DataType::Int32) .add_buffer(Buffer::from_slice_ref(&values)) - .build(); + .build() + .unwrap(); let list_data_type = DataType::List(Box::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .add_buffer(buf2) .add_child_data(value_data) - .build(); + .build() + .unwrap(); ListArray::from(list_data); } diff --git a/arrow/src/array/array_map.rs b/arrow/src/array/array_map.rs index b10c39e43b01..bd888ff83e9b 100644 --- a/arrow/src/array/array_map.rs +++ b/arrow/src/array/array_map.rs @@ -203,13 +203,15 @@ mod tests { let keys_data = ArrayData::builder(DataType::Int32) .len(8) .add_buffer(Buffer::from(&[0, 1, 2, 3, 4, 5, 6, 7].to_byte_slice())) - .build(); + .build() + .unwrap(); let values_data = ArrayData::builder(DataType::UInt32) .len(8) .add_buffer(Buffer::from( &[0u32, 10, 20, 30, 40, 50, 60, 70].to_byte_slice(), )) - .build(); + .build() + .unwrap(); // Construct a buffer for value offsets, for the nested array: // [[0, 1, 2], [3, 4, 5], [6, 7]] @@ -235,7 +237,8 @@ mod tests { .len(3) .add_buffer(entry_offsets) .add_child_data(entry_struct.data().clone()) - .build(); + .build() + .unwrap(); MapArray::from(map_data) } @@ -245,14 +248,16 @@ mod tests { let key_data = ArrayData::builder(DataType::Int32) .len(8) .add_buffer(Buffer::from(&[0, 1, 2, 3, 4, 5, 6, 7].to_byte_slice())) - .build(); + .build() + .unwrap(); let value_data = ArrayData::builder(DataType::UInt32) .len(8) .add_buffer(Buffer::from( &[0u32, 10, 20, 0, 40, 0, 60, 70].to_byte_slice(), )) .null_bit_buffer(Buffer::from(&[0b11010110])) - .build(); + .build() + .unwrap(); // Construct a buffer for value offsets, for the nested array: // [[0, 1, 2], [3, 4, 5], [6, 7]] @@ -278,7 +283,8 @@ mod tests { .len(3) 
.add_buffer(entry_offsets) .add_child_data(entry_struct.data().clone()) - .build(); + .build() + .unwrap(); let map_array = MapArray::from(map_data); let values = map_array.values(); @@ -318,7 +324,8 @@ mod tests { .offset(1) .add_buffer(map_array.data().buffers()[0].clone()) .add_child_data(map_array.data().child_data()[0].clone()) - .build(); + .build() + .unwrap(); let map_array = MapArray::from(map_data); let values = map_array.values(); @@ -375,11 +382,13 @@ mod tests { let keys_data = ArrayData::builder(DataType::Int32) .len(5) .add_buffer(Buffer::from(&[3, 4, 5, 6, 7].to_byte_slice())) - .build(); + .build() + .unwrap(); let values_data = ArrayData::builder(DataType::UInt32) .len(5) .add_buffer(Buffer::from(&[30u32, 40, 50, 60, 70].to_byte_slice())) - .build(); + .build() + .unwrap(); // Construct a buffer for value offsets, for the nested array: // [[3, 4, 5], [6, 7]] @@ -405,7 +414,8 @@ mod tests { .len(2) .add_buffer(entry_offsets) .add_child_data(entry_struct.data().clone()) - .build(); + .build() + .unwrap(); let expected_map_array = MapArray::from(expected_map_data); assert_eq!(&expected_map_array, sliced_map_array) diff --git a/arrow/src/array/array_primitive.rs b/arrow/src/array/array_primitive.rs index 0878fb4449a5..a93e703946d1 100644 --- a/arrow/src/array/array_primitive.rs +++ b/arrow/src/array/array_primitive.rs @@ -124,15 +124,17 @@ impl PrimitiveArray { /// Creates a PrimitiveArray based on an iterator of values without nulls pub fn from_iter_values>(iter: I) -> Self { let val_buf: Buffer = iter.into_iter().collect(); - let data = ArrayData::new( - T::DATA_TYPE, - val_buf.len() / mem::size_of::<::Native>(), - None, - None, - 0, - vec![val_buf], - vec![], - ); + let data = unsafe { + ArrayData::new_unchecked( + T::DATA_TYPE, + val_buf.len() / mem::size_of::<::Native>(), + None, + None, + 0, + vec![val_buf], + vec![], + ) + }; PrimitiveArray::from(data) } @@ -140,15 +142,17 @@ impl PrimitiveArray { pub fn from_value(value: T::Native, count: 
usize) -> Self { // # Safety: length is known let val_buf = unsafe { Buffer::from_trusted_len_iter((0..count).map(|_| value)) }; - let data = ArrayData::new( - T::DATA_TYPE, - val_buf.len() / mem::size_of::<::Native>(), - None, - None, - 0, - vec![val_buf], - vec![], - ); + let data = unsafe { + ArrayData::new_unchecked( + T::DATA_TYPE, + val_buf.len() / mem::size_of::<::Native>(), + None, + None, + 0, + vec![val_buf], + vec![], + ) + }; PrimitiveArray::from(data) } } @@ -350,15 +354,17 @@ impl::Native }) .collect(); - let data = ArrayData::new( - T::DATA_TYPE, - null_buf.len(), - None, - Some(null_buf.into()), - 0, - vec![buffer], - vec![], - ); + let data = unsafe { + ArrayData::new_unchecked( + T::DATA_TYPE, + null_buf.len(), + None, + Some(null_buf.into()), + 0, + vec![buffer], + vec![], + ) + }; PrimitiveArray::from(data) } } @@ -380,8 +386,15 @@ impl PrimitiveArray { let (null, buffer) = trusted_len_unzip(iterator); - let data = - ArrayData::new(T::DATA_TYPE, len, None, Some(null), 0, vec![buffer], vec![]); + let data = ArrayData::new_unchecked( + T::DATA_TYPE, + len, + None, + Some(null), + 0, + vec![buffer], + vec![], + ); PrimitiveArray::from(data) } } @@ -395,8 +408,8 @@ macro_rules! 
def_numeric_from_vec { fn from(data: Vec<<$ty as ArrowPrimitiveType>::Native>) -> Self { let array_data = ArrayData::builder($ty::DATA_TYPE) .len(data.len()) - .add_buffer(Buffer::from_slice_ref(&data)) - .build(); + .add_buffer(Buffer::from_slice_ref(&data)); + let array_data = unsafe { array_data.build_unchecked() }; PrimitiveArray::from(array_data) } } @@ -446,8 +459,8 @@ impl PrimitiveArray { let array_data = ArrayData::builder(DataType::Timestamp(T::get_time_unit(), timezone)) .len(data.len()) - .add_buffer(Buffer::from_slice_ref(&data)) - .build(); + .add_buffer(Buffer::from_slice_ref(&data)); + let array_data = unsafe { array_data.build_unchecked() }; PrimitiveArray::from(array_data) } } @@ -476,8 +489,8 @@ impl PrimitiveArray { ArrayData::builder(DataType::Timestamp(T::get_time_unit(), timezone)) .len(data_len) .add_buffer(val_buf.into()) - .null_bit_buffer(null_buf.into()) - .build(); + .null_bit_buffer(null_buf.into()); + let array_data = unsafe { array_data.build_unchecked() }; PrimitiveArray::from(array_data) } } @@ -887,7 +900,8 @@ mod tests { .len(5) .offset(2) .add_buffer(buf) - .build(); + .build() + .unwrap(); let arr = Int32Array::from(data); assert_eq!(buf2, arr.data.buffers()[0]); assert_eq!(5, arr.len()); @@ -936,7 +950,7 @@ mod tests { #[should_panic(expected = "PrimitiveArray data should contain a single buffer only \ (values buffer)")] fn test_primitive_array_invalid_buffer_len() { - let data = ArrayData::builder(DataType::Int32).len(5).build(); + let data = ArrayData::builder(DataType::Int32).len(5).build().unwrap(); Int32Array::from(data); } diff --git a/arrow/src/array/array_string.rs b/arrow/src/array/array_string.rs index d72dbb2d9d02..c07f34a6b726 100644 --- a/arrow/src/array/array_string.rs +++ b/arrow/src/array/array_string.rs @@ -133,8 +133,8 @@ impl GenericStringArray { builder = builder.null_bit_buffer(bitmap.bits.clone()) } - let data = builder.build(); - Self::from(data) + let array_data = unsafe { builder.build_unchecked() }; + 
Self::from(array_data) } pub(crate) fn from_vec(v: Vec) -> Self @@ -156,8 +156,8 @@ impl GenericStringArray { let array_data = ArrayData::builder(OffsetSize::DATA_TYPE) .len(v.len()) .add_buffer(offsets.into()) - .add_buffer(values.into()) - .build(); + .add_buffer(values.into()); + let array_data = unsafe { array_data.build_unchecked() }; Self::from(array_data) } @@ -190,8 +190,8 @@ impl GenericStringArray { let array_data = ArrayData::builder(OffsetSize::DATA_TYPE) .len(data_len) .add_buffer(offsets.into()) - .add_buffer(values.into()) - .build(); + .add_buffer(values.into()); + let array_data = unsafe { array_data.build_unchecked() }; Self::from(array_data) } } @@ -249,8 +249,8 @@ where .len(data_len) .add_buffer(offsets.into()) .add_buffer(values.into()) - .null_bit_buffer(null_buf.into()) - .build(); + .null_bit_buffer(null_buf.into()); + let array_data = unsafe { array_data.build_unchecked() }; Self::from(array_data) } } @@ -475,7 +475,8 @@ mod tests { .len(3) .add_buffer(Buffer::from_slice_ref(&offsets)) .add_buffer(Buffer::from_slice_ref(&values)) - .build(); + .build() + .unwrap(); let string_array = StringArray::from(array_data); string_array.value(4); } diff --git a/arrow/src/array/array_struct.rs b/arrow/src/array/array_struct.rs index 4f7a609a14af..a1cab7f50c70 100644 --- a/arrow/src/array/array_struct.rs +++ b/arrow/src/array/array_struct.rs @@ -181,7 +181,9 @@ impl TryFrom> for StructArray { builder = builder.null_bit_buffer(null_buffer); } - Ok(StructArray::from(builder.build())) + let array_data = unsafe { builder.build_unchecked() }; + + Ok(StructArray::from(array_data)) } } @@ -219,11 +221,11 @@ impl From> for StructArray { ) } - let data = ArrayData::builder(DataType::Struct(field_types)) + let array_data = ArrayData::builder(DataType::Struct(field_types)) .child_data(field_values.into_iter().map(|a| a.data().clone()).collect()) - .len(length) - .build(); - Self::from(data) + .len(length); + let array_data = unsafe { array_data.build_unchecked() 
}; + Self::from(array_data) } } @@ -265,12 +267,12 @@ impl From<(Vec<(Field, ArrayRef)>, Buffer)> for StructArray { ) } - let data = ArrayData::builder(DataType::Struct(field_types)) + let array_data = ArrayData::builder(DataType::Struct(field_types)) .null_bit_buffer(pair.1) .child_data(field_values.into_iter().map(|a| a.data().clone()).collect()) - .len(length) - .build(); - Self::from(data) + .len(length); + let array_data = unsafe { array_data.build_unchecked() }; + Self::from(array_data) } } @@ -305,7 +307,8 @@ mod tests { .len(4) .add_child_data(boolean_data.clone()) .add_child_data(int_data.clone()) - .build(); + .build() + .unwrap(); let struct_array = StructArray::from(struct_array_data); assert_eq!(boolean_data, struct_array.column(0).data()); @@ -364,13 +367,15 @@ mod tests { .null_bit_buffer(Buffer::from(&[9_u8])) .add_buffer(Buffer::from(&[0, 3, 3, 3, 7].to_byte_slice())) .add_buffer(Buffer::from(b"joemark")) - .build(); + .build() + .unwrap(); let expected_int_data = ArrayData::builder(DataType::Int32) .len(4) .null_bit_buffer(Buffer::from(&[11_u8])) .add_buffer(Buffer::from(&[1, 2, 0, 4].to_byte_slice())) - .build(); + .build() + .unwrap(); assert_eq!(expected_string_data, *arr.column(0).data()); assert_eq!(expected_int_data, *arr.column(1).data()); @@ -422,12 +427,14 @@ mod tests { .len(5) .add_buffer(Buffer::from([0b00010000])) .null_bit_buffer(Buffer::from([0b00010001])) - .build(); + .build() + .unwrap(); let int_data = ArrayData::builder(DataType::Int32) .len(5) .add_buffer(Buffer::from([0, 28, 42, 0, 0].to_byte_slice())) .null_bit_buffer(Buffer::from([0b00000110])) - .build(); + .build() + .unwrap(); let mut field_types = vec![]; field_types.push(Field::new("a", DataType::Boolean, false)); @@ -437,7 +444,8 @@ mod tests { .add_child_data(boolean_data.clone()) .add_child_data(int_data.clone()) .null_bit_buffer(Buffer::from([0b00010111])) - .build(); + .build() + .unwrap(); let struct_array = StructArray::from(struct_array_data); assert_eq!(5, 
struct_array.len()); diff --git a/arrow/src/array/array_union.rs b/arrow/src/array/array_union.rs index b701b86285b8..ba563ec796b4 100644 --- a/arrow/src/array/array_union.rs +++ b/arrow/src/array/array_union.rs @@ -82,9 +82,11 @@ impl UnionArray { if let Some(bitmap) = bitmap_data { builder = builder.null_bit_buffer(bitmap) } - let data = match value_offsets { - Some(b) => builder.add_buffer(b).build(), - None => builder.build(), + let data = unsafe { + match value_offsets { + Some(b) => builder.add_buffer(b).build_unchecked(), + None => builder.build_unchecked(), + } }; Self::from(data) } diff --git a/arrow/src/array/builder.rs b/arrow/src/array/builder.rs index 50a931946b46..60f76d95485f 100644 --- a/arrow/src/array/builder.rs +++ b/arrow/src/array/builder.rs @@ -601,8 +601,8 @@ impl BooleanBuilder { if null_count > 0 { builder = builder.null_bit_buffer(null_bit_buffer); } - let data = builder.build(); - BooleanArray::from(data) + let array_data = unsafe { builder.build_unchecked() }; + BooleanArray::from(array_data) } } @@ -800,8 +800,8 @@ impl PrimitiveBuilder { if null_count > 0 { builder = builder.null_bit_buffer(null_bit_buffer.unwrap()); } - let data = builder.build(); - PrimitiveArray::::from(data) + let array_data = unsafe { builder.build_unchecked() }; + PrimitiveArray::::from(array_data) } /// Builds the `DictionaryArray` and reset this builder. 
@@ -824,7 +824,8 @@ impl PrimitiveBuilder { builder = builder.null_bit_buffer(null_bit_buffer.unwrap()); } builder = builder.add_child_data(values.data().clone()); - DictionaryArray::::from(builder.build()) + let array_data = unsafe { builder.build_unchecked() }; + DictionaryArray::::from(array_data) } fn materialize_bitmap_builder(&mut self) { @@ -952,14 +953,15 @@ where } else { DataType::List(field) }; - let data = ArrayData::builder(data_type) + let array_data = ArrayData::builder(data_type) .len(len) .add_buffer(offset_buffer) .add_child_data(values_data.clone()) - .null_bit_buffer(null_bit_buffer) - .build(); + .null_bit_buffer(null_bit_buffer); - GenericListArray::::from(data) + let array_data = unsafe { array_data.build_unchecked() }; + + GenericListArray::::from(array_data) } } @@ -1080,16 +1082,17 @@ where } let null_bit_buffer = self.bitmap_builder.finish(); - let data = ArrayData::builder(DataType::FixedSizeList( + let array_data = ArrayData::builder(DataType::FixedSizeList( Box::new(Field::new("item", values_data.data_type().clone(), true)), self.list_len, )) .len(len) .add_child_data(values_data.clone()) - .null_bit_buffer(null_bit_buffer) - .build(); + .null_bit_buffer(null_bit_buffer); + + let array_data = unsafe { array_data.build_unchecked() }; - FixedSizeListArray::from(data) + FixedSizeListArray::from(array_data) } } @@ -1677,7 +1680,8 @@ impl StructBuilder { self.len = 0; - StructArray::from(builder.build()) + let array_data = unsafe { builder.build_unchecked() }; + StructArray::from(array_data) } } @@ -1801,14 +1805,15 @@ impl MapBuilder { struct_array.data_type().clone(), false, // always non-nullable )); - let data = ArrayData::builder(DataType::Map(map_field, false)) // TODO: support sorted keys + let array_data = ArrayData::builder(DataType::Map(map_field, false)) // TODO: support sorted keys .len(len) .add_buffer(offset_buffer) .add_child_data(struct_array.data().clone()) - .null_bit_buffer(null_bit_buffer) - .build(); + 
.null_bit_buffer(null_bit_buffer); - MapArray::from(data) + let array_data = unsafe { array_data.build_unchecked() }; + + MapArray::from(array_data) } } @@ -2131,9 +2136,13 @@ impl UnionBuilder { .add_buffer(buffer) .len(slots); // .build(); - let arr_data_ref = match bitmap_builder { - Some(mut bb) => arr_data_builder.null_bit_buffer(bb.finish()).build(), - None => arr_data_builder.build(), + let arr_data_ref = unsafe { + match bitmap_builder { + Some(mut bb) => arr_data_builder + .null_bit_buffer(bb.finish()) + .build_unchecked(), + None => arr_data_builder.build_unchecked(), + } }; let array_ref = make_array(arr_data_ref); children.push((type_id, (Field::new(&name, data_type, false), array_ref))) @@ -3453,13 +3462,15 @@ mod tests { .null_bit_buffer(Buffer::from(&[9_u8])) .add_buffer(Buffer::from_slice_ref(&[0, 3, 3, 3, 7])) .add_buffer(Buffer::from_slice_ref(b"joemark")) - .build(); + .build() + .unwrap(); let expected_int_data = ArrayData::builder(DataType::Int32) .len(4) .null_bit_buffer(Buffer::from_slice_ref(&[11_u8])) .add_buffer(Buffer::from_slice_ref(&[1, 2, 0, 4])) - .build(); + .build() + .unwrap(); assert_eq!(expected_string_data, *arr.column(0).data()); assert_eq!(expected_int_data, *arr.column(1).data()); @@ -3565,13 +3576,15 @@ mod tests { .null_bit_buffer(Buffer::from(&[9_u8])) .add_buffer(Buffer::from_slice_ref(&[0, 3, 3, 3, 7])) .add_buffer(Buffer::from_slice_ref(b"joemark")) - .build(); + .build() + .unwrap(); let expected_int_data = ArrayData::builder(DataType::Int32) .len(4) .null_bit_buffer(Buffer::from_slice_ref(&[11_u8])) .add_buffer(Buffer::from_slice_ref(&[1, 2, 0, 4])) - .build(); + .build() + .unwrap(); assert_eq!(&expected_string_data, arr.keys().data()); assert_eq!(&expected_int_data, arr.values().data()); diff --git a/arrow/src/array/data.rs b/arrow/src/array/data.rs index cb389cacc7f6..dbc54342b034 100644 --- a/arrow/src/array/data.rs +++ b/arrow/src/array/data.rs @@ -22,6 +22,7 @@ use std::mem; use std::sync::Arc; use 
crate::datatypes::{DataType, IntervalUnit}; +use crate::error::Result; use crate::{bitmap::Bitmap, datatypes::ArrowNativeType}; use crate::{ buffer::{Buffer, MutableBuffer}, @@ -239,7 +240,20 @@ pub struct ArrayData { pub type ArrayDataRef = Arc; impl ArrayData { - pub fn new( + /// Create a new ArrayData instance; + /// + /// If `null_count` is not specified, the number of nulls in + /// null_bit_buffer is calculated + /// + /// # Safety + /// + /// The input values *must* form a valid Arrow array for + /// `data_type`, or undefined behavior can result. + /// + /// Note: This is a low level API and most users of the arrow + /// crate should create arrays using the methods in the `array` + /// module. + pub unsafe fn new_unchecked( data_type: DataType, len: usize, null_count: Option, @@ -264,6 +278,53 @@ impl ArrayData { } } + /// Create a new ArrayData, validating that the provided buffers + /// form a valid Arrow array of the specified data type. + /// + /// If `null_count` is not specified, the number of nulls in + /// null_bit_buffer is calculated + /// + /// Note: This is a low level API and most users of the arrow + /// crate should create arrays using the methods in the `array` + /// module. + pub fn try_new( + data_type: DataType, + len: usize, + null_count: Option, + null_bit_buffer: Option, + offset: usize, + buffers: Vec, + child_data: Vec, + ) -> Result { + // Safety justification: `validate` is (will be) called below + let new_self = unsafe { + Self::new_unchecked( + data_type, + len, + null_count, + null_bit_buffer, + offset, + buffers, + child_data, + ) + }; + + new_self.validate()?; + Ok(new_self) + } + + /// Validates that buffers in this ArrayData are sufficiently + /// sized, to store `len` + `offset` total elements of + /// `data_type`. + /// + /// This check is "cheap" in the sense that it does not validate the + /// contents of the buffers (e.g. that string offsets for UTF8 arrays + /// are within the length of the buffer). 
+ pub fn validate(&self) -> Result<()> { + // will be filled in a subsequent PR + Ok(()) + } + /// Returns a builder to construct a `ArrayData` instance. #[inline] pub const fn builder(data_type: DataType) -> ArrayDataBuilder { @@ -485,7 +546,18 @@ impl ArrayData { DataType::Float16 => unreachable!(), }; - Self::new(data_type.clone(), 0, Some(0), None, 0, buffers, child_data) + // Data was constructed correctly above + unsafe { + Self::new_unchecked( + data_type.clone(), + 0, + Some(0), + None, + 0, + buffers, + child_data, + ) + } } } @@ -564,8 +636,27 @@ impl ArrayDataBuilder { self } - pub fn build(self) -> ArrayData { - ArrayData::new( + /// Creates an array data, without any validation + /// + /// # Safety + /// + /// The same caveats as [`ArrayData::new_unchecked`] + /// apply. + pub unsafe fn build_unchecked(self) -> ArrayData { + ArrayData::new_unchecked( + self.data_type, + self.len, + self.null_count, + self.null_bit_buffer, + self.offset, + self.buffers, + self.child_data, + ) + } + + /// Creates an array data, validating all inputs + pub fn build(self) -> Result { + ArrayData::try_new( self.data_type, self.len, self.null_count, @@ -587,7 +678,8 @@ mod tests { #[test] fn test_new() { let arr_data = - ArrayData::new(DataType::Boolean, 10, Some(1), None, 2, vec![], vec![]); + ArrayData::try_new(DataType::Boolean, 10, Some(1), None, 2, vec![], vec![]) + .unwrap(); assert_eq!(10, arr_data.len()); assert_eq!(1, arr_data.null_count()); assert_eq!(2, arr_data.offset()); @@ -597,7 +689,7 @@ mod tests { #[test] fn test_builder() { - let child_arr_data = ArrayData::new( + let child_arr_data = ArrayData::try_new( DataType::Int32, 5, Some(0), @@ -605,7 +697,8 @@ mod tests { 0, vec![Buffer::from_slice_ref(&[1i32, 2, 3, 4, 5])], vec![], - ); + ) + .unwrap(); let v = vec![0, 1, 2, 3]; let b1 = Buffer::from(&v[..]); let arr_data = ArrayData::builder(DataType::Int32) @@ -616,7 +709,8 @@ mod tests { 0b01011111, 0b10110101, 0b01100011, 0b00011110, ])) 
.add_child_data(child_arr_data.clone()) - .build(); + .build() + .unwrap(); assert_eq!(20, arr_data.len()); assert_eq!(10, arr_data.null_count()); @@ -636,7 +730,8 @@ mod tests { let arr_data = ArrayData::builder(DataType::Int32) .len(16) .null_bit_buffer(Buffer::from(bit_v)) - .build(); + .build() + .unwrap(); assert_eq!(13, arr_data.null_count()); // Test with offset @@ -648,7 +743,8 @@ mod tests { .len(12) .offset(2) .null_bit_buffer(Buffer::from(bit_v)) - .build(); + .build() + .unwrap(); assert_eq!(10, arr_data.null_count()); } @@ -661,7 +757,8 @@ mod tests { let arr_data = ArrayData::builder(DataType::Int32) .len(16) .null_bit_buffer(Buffer::from(bit_v)) - .build(); + .build() + .unwrap(); assert!(arr_data.null_buffer().is_some()); assert_eq!(&bit_v, arr_data.null_buffer().unwrap().as_slice()); } @@ -675,7 +772,8 @@ mod tests { let data = ArrayData::builder(DataType::Int32) .len(16) .null_bit_buffer(Buffer::from(bit_v)) - .build(); + .build() + .unwrap(); let new_data = data.slice(1, 15); assert_eq!(data.len() - 1, new_data.len()); assert_eq!(1, new_data.offset()); @@ -690,8 +788,8 @@ mod tests { #[test] fn test_equality() { - let int_data = ArrayData::builder(DataType::Int32).build(); - let float_data = ArrayData::builder(DataType::Float32).build(); + let int_data = ArrayData::builder(DataType::Int32).build().unwrap(); + let float_data = ArrayData::builder(DataType::Float32).build().unwrap(); assert_ne!(int_data, float_data); } diff --git a/arrow/src/array/equal/mod.rs b/arrow/src/array/equal/mod.rs index 8368717c6747..15d41a0d67d6 100644 --- a/arrow/src/array/equal/mod.rs +++ b/arrow/src/array/equal/mod.rs @@ -682,7 +682,8 @@ mod tests { .add_buffer(Buffer::from(vec![0i32, 2, 3, 4, 6, 7, 8].to_byte_slice())) .add_child_data(c_values.data().clone()) .null_bit_buffer(Buffer::from(vec![0b00001001])) - .build(); + .build() + .unwrap(); let d_values = Int32Array::from(vec![ Some(1), @@ -703,7 +704,8 @@ mod tests { .add_buffer(Buffer::from(vec![0i32, 2, 3, 4, 6, 
7, 8].to_byte_slice())) .add_child_data(d_values.data().clone()) .null_bit_buffer(Buffer::from(vec![0b00001001])) - .build(); + .build() + .unwrap(); test_equal(&c, &d, true); } @@ -1054,7 +1056,8 @@ mod tests { .len(5) .add_child_data(strings.data_ref().clone()) .add_child_data(ints.data_ref().clone()) - .build(); + .build() + .unwrap(); let a = crate::array::make_array(a); let b = ArrayData::builder(DataType::Struct(vec![ @@ -1065,7 +1068,8 @@ mod tests { .len(5) .add_child_data(strings.data_ref().clone()) .add_child_data(ints_non_null.data_ref().clone()) - .build(); + .build() + .unwrap(); let b = crate::array::make_array(b); test_equal(a.data_ref(), b.data_ref(), true); @@ -1080,7 +1084,8 @@ mod tests { .len(5) .add_child_data(strings.data_ref().clone()) .add_child_data(c_ints_non_null.data_ref().clone()) - .build(); + .build() + .unwrap(); let c = crate::array::make_array(c); test_equal(a.data_ref(), c.data_ref(), false); @@ -1094,7 +1099,8 @@ mod tests { .null_bit_buffer(Buffer::from(vec![0b00011110])) .len(5) .add_child_data(a.data_ref().clone()) - .build(); + .build() + .unwrap(); let a = crate::array::make_array(a); // reconstruct b, but with different data where the first struct is null @@ -1113,7 +1119,8 @@ mod tests { .len(5) .add_child_data(strings.data_ref().clone()) .add_child_data(ints_non_null.data_ref().clone()) - .build(); + .build() + .unwrap(); let b = ArrayData::builder(DataType::Struct(vec![Field::new( "f3", @@ -1123,7 +1130,8 @@ mod tests { .null_bit_buffer(Buffer::from(vec![0b00011110])) .len(5) .add_child_data(b) - .build(); + .build() + .unwrap(); let b = crate::array::make_array(b); test_equal(a.data_ref(), b.data_ref(), true); @@ -1155,7 +1163,8 @@ mod tests { .null_bit_buffer(Buffer::from(vec![0b00001010])) .len(5) .add_child_data(strings1.data_ref().clone()) - .build(); + .build() + .unwrap(); let a = crate::array::make_array(a); let b = ArrayData::builder(DataType::Struct(vec![Field::new( @@ -1166,7 +1175,8 @@ mod tests { 
.null_bit_buffer(Buffer::from(vec![0b00001010])) .len(5) .add_child_data(strings2.data_ref().clone()) - .build(); + .build() + .unwrap(); let b = crate::array::make_array(b); test_equal(a.data_ref(), b.data_ref(), true); @@ -1187,7 +1197,8 @@ mod tests { .null_bit_buffer(Buffer::from(vec![0b00001011])) .len(5) .add_child_data(strings3.data_ref().clone()) - .build(); + .build() + .unwrap(); let c = crate::array::make_array(c); test_equal(a.data_ref(), c.data_ref(), false); diff --git a/arrow/src/array/equal/utils.rs b/arrow/src/array/equal/utils.rs index 1e33a867c83b..8eb988cb9a98 100644 --- a/arrow/src/array/equal/utils.rs +++ b/arrow/src/array/equal/utils.rs @@ -214,7 +214,8 @@ mod tests { .add_buffer(Buffer::from( vec![1i32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11].to_byte_slice(), )) - .build(); + .build() + .unwrap(); let data = ArrayData::builder(DataType::List(Box::new(Field::new( "item", @@ -225,7 +226,8 @@ mod tests { .add_buffer(Buffer::from(vec![0, 0, 3, 5, 6, 9, 10, 11].to_byte_slice())) .null_bit_buffer(Buffer::from(vec![0b01011010])) .add_child_data(child_data.clone()) - .build(); + .build() + .unwrap(); // Get the child logical null buffer. The child is non-nullable, but because the list has nulls, // we expect the child to logically have some nulls, inherited from the parent: @@ -250,7 +252,8 @@ mod tests { // the null_bit_buffer doesn't have an offset, i.e. cleared the 3 offset bits 0b[---]01011[010] .null_bit_buffer(Buffer::from(vec![0b00001011])) .add_child_data(child_data) - .build(); + .build() + .unwrap(); let nulls = child_logical_null_buffer( &data, diff --git a/arrow/src/array/null.rs b/arrow/src/array/null.rs index 521d472f6df5..310c04a9c00b 100644 --- a/arrow/src/array/null.rs +++ b/arrow/src/array/null.rs @@ -52,7 +52,8 @@ impl NullArray { /// other [`DataType`]. 
/// pub fn new(length: usize) -> Self { - let array_data = ArrayData::builder(DataType::Null).len(length).build(); + let array_data = ArrayData::builder(DataType::Null).len(length); + let array_data = unsafe { array_data.build_unchecked() }; NullArray::from(array_data) } } diff --git a/arrow/src/array/transform/mod.rs b/arrow/src/array/transform/mod.rs index 69092c1af55d..a598f0d7167e 100644 --- a/arrow/src/array/transform/mod.rs +++ b/arrow/src/array/transform/mod.rs @@ -638,7 +638,7 @@ impl<'a> MutableArrayData<'a> { /// Creates a [ArrayData] from the pushed regions up to this point, consuming `self`. pub fn freeze(self) -> ArrayData { - self.data.freeze(self.dictionary).build() + unsafe { self.data.freeze(self.dictionary).build_unchecked() } } /// Creates a [ArrayDataBuilder] from the pushed regions up to this point, consuming `self`. @@ -1150,7 +1150,7 @@ mod tests { ]); let list_value_offsets = Buffer::from_slice_ref(&[0i32, 3, 5, 11, 13, 13, 15, 15, 17]); - let expected_list_data = ArrayData::new( + let expected_list_data = ArrayData::try_new( DataType::List(Box::new(Field::new("item", DataType::Int64, true))), 8, None, @@ -1158,7 +1158,8 @@ mod tests { 0, vec![list_value_offsets], vec![expected_int_array.data().clone()], - ); + ) + .unwrap(); assert_eq!(finished, expected_list_data); Ok(()) @@ -1231,7 +1232,7 @@ mod tests { ]); let list_value_offsets = Buffer::from_slice_ref(&[0, 3, 5, 5, 13, 15, 15, 15, 19, 19, 19, 19, 23]); - let expected_list_data = ArrayData::new( + let expected_list_data = ArrayData::try_new( DataType::List(Box::new(Field::new("item", DataType::Int64, true))), 12, None, @@ -1239,7 +1240,8 @@ mod tests { 0, vec![list_value_offsets], vec![expected_int_array.data().clone()], - ); + ) + .unwrap(); assert_eq!(result, expected_list_data); Ok(()) @@ -1302,7 +1304,7 @@ mod tests { // extend b[0..0] ]); let list_value_offsets = Buffer::from_slice_ref(&[0, 3, 5, 6, 9, 10, 13]); - let expected_list_data = ArrayData::new( + let expected_list_data = 
ArrayData::try_new( DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), 6, None, @@ -1310,7 +1312,8 @@ mod tests { 0, vec![list_value_offsets], vec![expected_string_array.data().clone()], - ); + ) + .unwrap(); assert_eq!(result, expected_list_data); Ok(()) } diff --git a/arrow/src/compute/kernels/arithmetic.rs b/arrow/src/compute/kernels/arithmetic.rs index b9596ee8cbd6..a15a9b306f0a 100644 --- a/arrow/src/compute/kernels/arithmetic.rs +++ b/arrow/src/compute/kernels/arithmetic.rs @@ -182,15 +182,17 @@ where // `values` is an iterator with a known size. let buffer = unsafe { Buffer::from_trusted_len_iter(values) }; - let data = ArrayData::new( - T::DATA_TYPE, - left.len(), - None, - null_bit_buffer, - 0, - vec![buffer], - vec![], - ); + let data = unsafe { + ArrayData::new_unchecked( + T::DATA_TYPE, + left.len(), + None, + null_bit_buffer, + 0, + vec![buffer], + vec![], + ) + }; Ok(PrimitiveArray::::from(data)) } @@ -250,15 +252,17 @@ where unsafe { Buffer::try_from_trusted_len_iter(values) } }?; - let data = ArrayData::new( - T::DATA_TYPE, - left.len(), - None, - null_bit_buffer, - 0, - vec![buffer], - vec![], - ); + let data = unsafe { + ArrayData::new_unchecked( + T::DATA_TYPE, + left.len(), + None, + null_bit_buffer, + 0, + vec![buffer], + vec![], + ) + }; Ok(PrimitiveArray::::from(data)) } @@ -318,15 +322,17 @@ where unsafe { Buffer::try_from_trusted_len_iter(values) } }?; - let data = ArrayData::new( - T::DATA_TYPE, - left.len(), - None, - null_bit_buffer, - 0, - vec![buffer], - vec![], - ); + let data = unsafe { + ArrayData::new_unchecked( + T::DATA_TYPE, + left.len(), + None, + null_bit_buffer, + 0, + vec![buffer], + vec![], + ) + }; Ok(PrimitiveArray::::from(data)) } diff --git a/arrow/src/compute/kernels/arity.rs b/arrow/src/compute/kernels/arity.rs index d7beae605993..41206e001d77 100644 --- a/arrow/src/compute/kernels/arity.rs +++ b/arrow/src/compute/kernels/arity.rs @@ -26,18 +26,20 @@ fn into_primitive_array_data( array: 
&PrimitiveArray, buffer: Buffer, ) -> ArrayData { - ArrayData::new( - O::DATA_TYPE, - array.len(), - None, - array - .data_ref() - .null_buffer() - .map(|b| b.bit_slice(array.offset(), array.len())), - 0, - vec![buffer], - vec![], - ) + unsafe { + ArrayData::new_unchecked( + O::DATA_TYPE, + array.len(), + None, + array + .data_ref() + .null_buffer() + .map(|b| b.bit_slice(array.offset(), array.len())), + 0, + vec![buffer], + vec![], + ) + } } /// Applies an unary and infalible function to a primitive array. diff --git a/arrow/src/compute/kernels/boolean.rs b/arrow/src/compute/kernels/boolean.rs index fcd1fb014b32..f9e17839e237 100644 --- a/arrow/src/compute/kernels/boolean.rs +++ b/arrow/src/compute/kernels/boolean.rs @@ -159,15 +159,17 @@ where let bool_buffer: Buffer = value_buffer.into(); let bool_valid_buffer: Buffer = valid_buffer.into(); - let array_data = ArrayData::new( - DataType::Boolean, - len, - None, - Some(bool_valid_buffer), - left_offset, - vec![bool_buffer], - vec![], - ); + let array_data = unsafe { + ArrayData::new_unchecked( + DataType::Boolean, + len, + None, + Some(bool_valid_buffer), + left_offset, + vec![bool_buffer], + vec![], + ) + }; Ok(BooleanArray::from(array_data)) } @@ -200,15 +202,17 @@ where let values = op(left_buffer, left_offset, right_buffer, right_offset, len); - let data = ArrayData::new( - DataType::Boolean, - len, - None, - null_bit_buffer, - 0, - vec![values], - vec![], - ); + let data = unsafe { + ArrayData::new_unchecked( + DataType::Boolean, + len, + None, + null_bit_buffer, + 0, + vec![values], + vec![], + ) + }; Ok(BooleanArray::from(data)) } @@ -380,15 +384,17 @@ pub fn not(left: &BooleanArray) -> Result { let values = buffer_unary_not(&data.buffers()[0], left_offset, len); - let data = ArrayData::new( - DataType::Boolean, - len, - None, - null_bit_buffer, - 0, - vec![values], - vec![], - ); + let data = unsafe { + ArrayData::new_unchecked( + DataType::Boolean, + len, + None, + null_bit_buffer, + 0, + vec![values], + 
vec![], + ) + }; Ok(BooleanArray::from(data)) } @@ -418,8 +424,17 @@ pub fn is_null(input: &dyn Array) -> Result { Some(buffer) => buffer_unary_not(buffer, input.offset(), len), }; - let data = - ArrayData::new(DataType::Boolean, len, None, None, 0, vec![output], vec![]); + let data = unsafe { + ArrayData::new_unchecked( + DataType::Boolean, + len, + None, + None, + 0, + vec![output], + vec![], + ) + }; Ok(BooleanArray::from(data)) } @@ -452,8 +467,17 @@ pub fn is_not_null(input: &dyn Array) -> Result { Some(buffer) => buffer.bit_slice(input.offset(), len), }; - let data = - ArrayData::new(DataType::Boolean, len, None, None, 0, vec![output], vec![]); + let data = unsafe { + ArrayData::new_unchecked( + DataType::Boolean, + len, + None, + None, + 0, + vec![output], + vec![], + ) + }; Ok(BooleanArray::from(data)) } @@ -537,15 +561,17 @@ where // Construct new array with same values but modified null bitmap // TODO: shift data buffer as needed - let data = ArrayData::new( - T::DATA_TYPE, - left.len(), - None, // force new to compute the number of null bits - modified_null_buffer, - 0, // No need for offset since left data has been shifted - data_buffers, - left_data.child_data().to_vec(), - ); + let data = unsafe { + ArrayData::new_unchecked( + T::DATA_TYPE, + left.len(), + None, // force new to compute the number of null bits + modified_null_buffer, + 0, // No need for offset since left data has been shifted + data_buffers, + left_data.child_data().to_vec(), + ) + }; Ok(PrimitiveArray::::from(data)) } diff --git a/arrow/src/compute/kernels/cast.rs b/arrow/src/compute/kernels/cast.rs index a0847d106332..b882019c4034 100644 --- a/arrow/src/compute/kernels/cast.rs +++ b/arrow/src/compute/kernels/cast.rs @@ -989,15 +989,17 @@ fn cast_array_data(array: &ArrayRef, to_type: DataType) -> Result where TO: ArrowNumericType, { - let data = ArrayData::new( - to_type, - array.len(), - Some(array.null_count()), - array.data().null_bitmap().clone().map(|bitmap| bitmap.bits), - 
array.data().offset(), - array.data().buffers().to_vec(), - vec![], - ); + let data = unsafe { + ArrayData::new_unchecked( + to_type, + array.len(), + Some(array.null_count()), + array.data().null_bitmap().clone().map(|bitmap| bitmap.bits), + array.data().offset(), + array.data().buffers().to_vec(), + vec![], + ) + }; Ok(Arc::new(PrimitiveArray::::from(data)) as ArrayRef) } @@ -1432,19 +1434,21 @@ fn dictionary_cast( } // keys are data, child_data is values (dictionary) - let data = ArrayData::new( - to_type.clone(), - cast_keys.len(), - Some(cast_keys.null_count()), - cast_keys - .data() - .null_bitmap() - .clone() - .map(|bitmap| bitmap.bits), - cast_keys.data().offset(), - cast_keys.data().buffers().to_vec(), - vec![cast_values.data().clone()], - ); + let data = unsafe { + ArrayData::new_unchecked( + to_type.clone(), + cast_keys.len(), + Some(cast_keys.null_count()), + cast_keys + .data() + .null_bitmap() + .clone() + .map(|bitmap| bitmap.bits), + cast_keys.data().offset(), + cast_keys.data().buffers().to_vec(), + vec![cast_values.data().clone()], + ) + }; // create the appropriate array type let new_array: ArrayRef = match **to_index_type { @@ -1648,19 +1652,21 @@ fn cast_primitive_to_list( ) }; - let list_data = ArrayData::new( - to_type.clone(), - array.len(), - Some(cast_array.null_count()), - cast_array - .data() - .null_bitmap() - .clone() - .map(|bitmap| bitmap.bits), - 0, - vec![offsets.into()], - vec![cast_array.data().clone()], - ); + let list_data = unsafe { + ArrayData::new_unchecked( + to_type.clone(), + array.len(), + Some(cast_array.null_count()), + cast_array + .data() + .null_bitmap() + .clone() + .map(|bitmap| bitmap.bits), + 0, + vec![offsets.into()], + vec![cast_array.data().clone()], + ) + }; let list_array = Arc::new(GenericListArray::::from(list_data)) as ArrayRef; @@ -1677,16 +1683,18 @@ fn cast_list_inner( let data = array.data_ref(); let underlying_array = make_array(data.child_data()[0].clone()); let cast_array = 
cast_with_options(&underlying_array, to.data_type(), cast_options)?; - let array_data = ArrayData::new( - to_type.clone(), - array.len(), - Some(data.null_count()), - data.null_bitmap().clone().map(|bitmap| bitmap.bits), - array.offset(), - // reuse offset buffer - data.buffers().to_vec(), - vec![cast_array.data().clone()], - ); + let array_data = unsafe { + ArrayData::new_unchecked( + to_type.clone(), + array.len(), + Some(data.null_count()), + data.null_bitmap().clone().map(|bitmap| bitmap.bits), + array.offset(), + // reuse offset buffer + data.buffers().to_vec(), + vec![cast_array.data().clone()], + ) + }; let list = GenericListArray::::from(array_data); Ok(Arc::new(list) as ArrayRef) } @@ -1735,8 +1743,11 @@ where if let Some(buf) = list_data.null_buffer() { builder = builder.null_bit_buffer(buf.clone()) } - let data = builder.build(); - Ok(Arc::new(GenericStringArray::::from(data))) + let array_data = unsafe { builder.build_unchecked() }; + + Ok(Arc::new(GenericStringArray::::from( + array_data, + ))) } /// Cast the container type of List/Largelist array but not the inner types. 
@@ -1811,8 +1822,8 @@ where if let Some(buf) = data.null_buffer() { builder = builder.null_bit_buffer(buf.clone()) } - let data = builder.build(); - Ok(make_array(data)) + let array_data = unsafe { builder.build_unchecked() }; + Ok(make_array(array_data)) } #[cfg(test)] @@ -2035,7 +2046,8 @@ mod tests { .len(3) .add_buffer(value_offsets) .add_child_data(value_data) - .build(); + .build() + .unwrap(); let list_array = Arc::new(ListArray::from(list_data)) as ArrayRef; let cast_array = cast( @@ -2097,7 +2109,8 @@ mod tests { .len(3) .add_buffer(value_offsets) .add_child_data(value_data) - .build(); + .build() + .unwrap(); let list_array = Arc::new(ListArray::from(list_data)) as ArrayRef; cast( @@ -3746,7 +3759,8 @@ mod tests { let value_data = ArrayData::builder(DataType::Int32) .len(8) .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7])) - .build(); + .build() + .unwrap(); // Construct a buffer for value offsets, for the nested array: // [[0, 1, 2], [3, 4, 5], [6, 7]] @@ -3759,7 +3773,8 @@ mod tests { .len(3) .add_buffer(value_offsets) .add_child_data(value_data) - .build(); + .build() + .unwrap(); ListArray::from(list_data) } @@ -3768,7 +3783,8 @@ mod tests { let value_data = ArrayData::builder(DataType::Int32) .len(8) .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7])) - .build(); + .build() + .unwrap(); // Construct a buffer for value offsets, for the nested array: // [[0, 1, 2], [3, 4, 5], [6, 7]] @@ -3781,7 +3797,8 @@ mod tests { .len(3) .add_buffer(value_offsets) .add_child_data(value_data) - .build(); + .build() + .unwrap(); LargeListArray::from(list_data) } @@ -3790,7 +3807,8 @@ mod tests { let value_data = ArrayData::builder(DataType::Int32) .len(10) .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) - .build(); + .build() + .unwrap(); // Construct a fixed size list array from the above two let list_data_type = DataType::FixedSizeList( @@ -3800,7 +3818,8 @@ mod tests { let list_data = ArrayData::builder(list_data_type) 
.len(5) .add_child_data(value_data) - .build(); + .build() + .unwrap(); FixedSizeListArray::from(list_data) } @@ -3810,7 +3829,8 @@ mod tests { let array_data = ArrayData::builder(DataType::FixedSizeBinary(5)) .len(3) .add_buffer(Buffer::from(&values[..])) - .build(); + .build() + .unwrap(); FixedSizeBinaryArray::from(array_data) } diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs index 49d0aca55093..60e275fe09e1 100644 --- a/arrow/src/compute/kernels/comparison.rs +++ b/arrow/src/compute/kernels/comparison.rs @@ -53,15 +53,17 @@ macro_rules! compare_op { // same size as $left.len() and $right.len() let buffer = unsafe { MutableBuffer::from_trusted_len_iter_bool(comparison) }; - let data = ArrayData::new( - DataType::Boolean, - $left.len(), - None, - null_bit_buffer, - 0, - vec![Buffer::from(buffer)], - vec![], - ); + let data = unsafe { + ArrayData::new_unchecked( + DataType::Boolean, + $left.len(), + None, + null_bit_buffer, + 0, + vec![Buffer::from(buffer)], + vec![], + ) + }; Ok(BooleanArray::from(data)) }}; } @@ -108,15 +110,17 @@ macro_rules! compare_op_primitive { *last |= if $op(lhs, rhs) { 1 << i } else { 0 }; }); }; - let data = ArrayData::new( - DataType::Boolean, - $left.len(), - None, - null_bit_buffer, - 0, - vec![Buffer::from(values)], - vec![], - ); + let data = unsafe { + ArrayData::new_unchecked( + DataType::Boolean, + $left.len(), + None, + null_bit_buffer, + 0, + vec![Buffer::from(values)], + vec![], + ) + }; Ok(BooleanArray::from(data)) }}; } @@ -135,15 +139,17 @@ macro_rules! 
compare_op_scalar { // same as $left.len() let buffer = unsafe { MutableBuffer::from_trusted_len_iter_bool(comparison) }; - let data = ArrayData::new( - DataType::Boolean, - $left.len(), - None, - null_bit_buffer, - 0, - vec![Buffer::from(buffer)], - vec![], - ); + let data = unsafe { + ArrayData::new_unchecked( + DataType::Boolean, + $left.len(), + None, + null_bit_buffer, + 0, + vec![Buffer::from(buffer)], + vec![], + ) + }; Ok(BooleanArray::from(data)) }}; } @@ -175,15 +181,17 @@ macro_rules! compare_op_scalar_primitive { }); }; - let data = ArrayData::new( - DataType::Boolean, - $left.len(), - None, - null_bit_buffer, - 0, - vec![Buffer::from(values)], - vec![], - ); + let data = unsafe { + ArrayData::new_unchecked( + DataType::Boolean, + $left.len(), + None, + null_bit_buffer, + 0, + vec![Buffer::from(values)], + vec![], + ) + }; Ok(BooleanArray::from(data)) }}; } @@ -270,15 +278,17 @@ pub fn like_utf8( result.append(re.is_match(haystack)); } - let data = ArrayData::new( - DataType::Boolean, - left.len(), - None, - null_bit_buffer, - 0, - vec![result.finish()], - vec![], - ); + let data = unsafe { + ArrayData::new_unchecked( + DataType::Boolean, + left.len(), + None, + null_bit_buffer, + 0, + vec![result.finish()], + vec![], + ) + }; Ok(BooleanArray::from(data)) } @@ -340,15 +350,17 @@ pub fn like_utf8_scalar( } }; - let data = ArrayData::new( - DataType::Boolean, - left.len(), - None, - null_bit_buffer, - 0, - vec![bool_buf.into()], - vec![], - ); + let data = unsafe { + ArrayData::new_unchecked( + DataType::Boolean, + left.len(), + None, + null_bit_buffer, + 0, + vec![bool_buf.into()], + vec![], + ) + }; Ok(BooleanArray::from(data)) } @@ -392,15 +404,17 @@ pub fn nlike_utf8( result.append(!re.is_match(haystack)); } - let data = ArrayData::new( - DataType::Boolean, - left.len(), - None, - null_bit_buffer, - 0, - vec![result.finish()], - vec![], - ); + let data = unsafe { + ArrayData::new_unchecked( + DataType::Boolean, + left.len(), + None, + null_bit_buffer, 
+ 0, + vec![result.finish()], + vec![], + ) + }; Ok(BooleanArray::from(data)) } @@ -445,15 +459,17 @@ pub fn nlike_utf8_scalar( } } - let data = ArrayData::new( - DataType::Boolean, - left.len(), - None, - null_bit_buffer, - 0, - vec![result.finish()], - vec![], - ); + let data = unsafe { + ArrayData::new_unchecked( + DataType::Boolean, + left.len(), + None, + null_bit_buffer, + 0, + vec![result.finish()], + vec![], + ) + }; Ok(BooleanArray::from(data)) } @@ -530,15 +546,17 @@ pub fn regexp_is_match_utf8( }) .collect::>>()?; - let data = ArrayData::new( - DataType::Boolean, - array.len(), - None, - null_bit_buffer, - 0, - vec![result.finish()], - vec![], - ); + let data = unsafe { + ArrayData::new_unchecked( + DataType::Boolean, + array.len(), + None, + null_bit_buffer, + 0, + vec![result.finish()], + vec![], + ) + }; Ok(BooleanArray::from(data)) } @@ -575,15 +593,17 @@ pub fn regexp_is_match_utf8_scalar( } } - let data = ArrayData::new( - DataType::Boolean, - array.len(), - None, - null_bit_buffer, - 0, - vec![result.finish()], - vec![], - ); + let data = unsafe { + ArrayData::new_unchecked( + DataType::Boolean, + array.len(), + None, + null_bit_buffer, + 0, + vec![result.finish()], + vec![], + ) + }; Ok(BooleanArray::from(data)) } @@ -1046,15 +1066,17 @@ where } } - let data = ArrayData::new( - DataType::Boolean, - left.len(), - None, - None, - 0, - vec![bool_buf.into()], - vec![], - ); + let data = unsafe { + ArrayData::new_unchecked( + DataType::Boolean, + left.len(), + None, + None, + 0, + vec![bool_buf.into()], + vec![], + ) + }; Ok(BooleanArray::from(data)) } @@ -1104,15 +1126,17 @@ where } } - let data = ArrayData::new( - DataType::Boolean, - left.len(), - None, - None, - 0, - vec![bool_buf.into()], - vec![], - ); + let data = unsafe { + ArrayData::new_unchecked( + DataType::Boolean, + left.len(), + None, + None, + 0, + vec![bool_buf.into()], + vec![], + ) + }; Ok(BooleanArray::from(data)) } @@ -1453,7 +1477,8 @@ mod tests { .add_buffer(value_offsets) 
.add_child_data(value_data) .null_bit_buffer(Buffer::from([0b00001011])) - .build(); + .build() + .unwrap(); // [[0, 1, 2], [3, 4, 5], null, [6, null, 7]] let list_array = LargeListArray::from(list_data); diff --git a/arrow/src/compute/kernels/filter.rs b/arrow/src/compute/kernels/filter.rs index 55b1cd1ceee7..61a73d0d64bf 100644 --- a/arrow/src/compute/kernels/filter.rs +++ b/arrow/src/compute/kernels/filter.rs @@ -224,8 +224,10 @@ pub fn prep_null_mask_filter(filter: &BooleanArray) -> BooleanArray { let array_data = ArrayData::builder(DataType::Boolean) .len(filter.len()) - .add_buffer(new_mask) - .build(); + .add_buffer(new_mask); + + let array_data = unsafe { array_data.build_unchecked() }; + BooleanArray::from(array_data) } @@ -566,7 +568,8 @@ mod tests { let value_data = ArrayData::builder(DataType::Int32) .len(8) .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7])) - .build(); + .build() + .unwrap(); let value_offsets = Buffer::from_slice_ref(&[0i64, 3, 6, 8, 8]); @@ -577,7 +580,8 @@ mod tests { .add_buffer(value_offsets) .add_child_data(value_data) .null_bit_buffer(Buffer::from([0b00000111])) - .build(); + .build() + .unwrap(); // a = [[0, 1, 2], [3, 4, 5], [6, 7], null] let a = LargeListArray::from(list_data); @@ -588,7 +592,8 @@ mod tests { let value_data = ArrayData::builder(DataType::Int32) .len(3) .add_buffer(Buffer::from_slice_ref(&[3, 4, 5])) - .build(); + .build() + .unwrap(); let value_offsets = Buffer::from_slice_ref(&[0i64, 3, 3]); @@ -599,7 +604,8 @@ mod tests { .add_buffer(value_offsets) .add_child_data(value_data) .null_bit_buffer(Buffer::from([0b00000001])) - .build(); + .build() + .unwrap(); assert_eq!(&make_array(expected), &result); } diff --git a/arrow/src/compute/kernels/length.rs b/arrow/src/compute/kernels/length.rs index fb76d000076e..b0f3d9ad58ef 100644 --- a/arrow/src/compute/kernels/length.rs +++ b/arrow/src/compute/kernels/length.rs @@ -56,15 +56,17 @@ where .null_buffer() .map(|b| b.bit_slice(array.offset(), 
array.len())); - let data = ArrayData::new( - data_type, - array.len(), - None, - null_bit_buffer, - 0, - vec![buffer], - vec![], - ); + let data = unsafe { + ArrayData::new_unchecked( + data_type, + array.len(), + None, + null_bit_buffer, + 0, + vec![buffer], + vec![], + ) + }; make_array(data) } diff --git a/arrow/src/compute/kernels/limit.rs b/arrow/src/compute/kernels/limit.rs index cafbbf0ab9b7..34f5dcafc3d0 100644 --- a/arrow/src/compute/kernels/limit.rs +++ b/arrow/src/compute/kernels/limit.rs @@ -92,7 +92,8 @@ mod tests { let value_data = ArrayData::builder(DataType::Int32) .len(10) .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) - .build(); + .build() + .unwrap(); // Construct a buffer for value offsets, for the nested array: // [[0, 1], null, [2, 3], null, [4, 5], null, [6, 7, 8], null, [9]] @@ -113,7 +114,8 @@ mod tests { .add_buffer(value_offsets) .add_child_data(value_data) .null_bit_buffer(Buffer::from(null_bits)) - .build(); + .build() + .unwrap(); let list_array: ArrayRef = Arc::new(ListArray::from(list_data)); let limit_array = limit(&list_array, 6); @@ -144,12 +146,14 @@ mod tests { .len(5) .add_buffer(Buffer::from([0b00010000])) .null_bit_buffer(Buffer::from([0b00010001])) - .build(); + .build() + .unwrap(); let int_data = ArrayData::builder(DataType::Int32) .len(5) .add_buffer(Buffer::from_slice_ref(&[0, 28, 42, 0, 0])) .null_bit_buffer(Buffer::from([0b00000110])) - .build(); + .build() + .unwrap(); let mut field_types = vec![]; field_types.push(Field::new("a", DataType::Boolean, false)); @@ -159,7 +163,8 @@ mod tests { .add_child_data(boolean_data.clone()) .add_child_data(int_data.clone()) .null_bit_buffer(Buffer::from([0b00010111])) - .build(); + .build() + .unwrap(); let struct_array = StructArray::from(struct_array_data); assert_eq!(5, struct_array.len()); diff --git a/arrow/src/compute/kernels/sort.rs b/arrow/src/compute/kernels/sort.rs index 6f42be34aa44..88c7785bc985 100644 --- a/arrow/src/compute/kernels/sort.rs +++ 
b/arrow/src/compute/kernels/sort.rs @@ -488,15 +488,17 @@ fn sort_boolean( } } - let result_data = ArrayData::new( - DataType::UInt32, - len, - Some(0), - None, - 0, - vec![result.into()], - vec![], - ); + let result_data = unsafe { + ArrayData::new_unchecked( + DataType::UInt32, + len, + Some(0), + None, + 0, + vec![result.into()], + vec![], + ) + }; UInt32Array::from(result_data) } @@ -574,15 +576,17 @@ where } } - let result_data = ArrayData::new( - DataType::UInt32, - len, - Some(0), - None, - 0, - vec![result.into()], - vec![], - ); + let result_data = unsafe { + ArrayData::new_unchecked( + DataType::UInt32, + len, + Some(0), + None, + 0, + vec![result.into()], + vec![], + ) + }; UInt32Array::from(result_data) } diff --git a/arrow/src/compute/kernels/substring.rs b/arrow/src/compute/kernels/substring.rs index d4ea6616c648..01fdf640bdae 100644 --- a/arrow/src/compute/kernels/substring.rs +++ b/arrow/src/compute/kernels/substring.rs @@ -74,18 +74,20 @@ fn generic_substring( new_values.extend_from_slice(&data[start..start + length]); }); - let data = ArrayData::new( - ::DATA_TYPE, - array.len(), - None, - null_bit_buffer, - 0, - vec![ - Buffer::from_slice_ref(&new_offsets), - Buffer::from_slice_ref(&new_values), - ], - vec![], - ); + let data = unsafe { + ArrayData::new_unchecked( + ::DATA_TYPE, + array.len(), + None, + null_bit_buffer, + 0, + vec![ + Buffer::from_slice_ref(&new_offsets), + Buffer::from_slice_ref(&new_values), + ], + vec![], + ) + }; Ok(make_array(data)) } diff --git a/arrow/src/compute/kernels/take.rs b/arrow/src/compute/kernels/take.rs index 71479723e022..692de278974d 100644 --- a/arrow/src/compute/kernels/take.rs +++ b/arrow/src/compute/kernels/take.rs @@ -523,15 +523,17 @@ where } }; - let data = ArrayData::new( - T::DATA_TYPE, - indices.len(), - None, - nulls, - 0, - vec![buffer], - vec![], - ); + let data = unsafe { + ArrayData::new_unchecked( + T::DATA_TYPE, + indices.len(), + None, + nulls, + 0, + vec![buffer], + vec![], + ) + }; 
Ok(PrimitiveArray::::from(data)) } @@ -598,15 +600,17 @@ where }; } - let data = ArrayData::new( - DataType::Boolean, - indices.len(), - None, - nulls, - 0, - vec![val_buf.into()], - vec![], - ); + let data = unsafe { + ArrayData::new_unchecked( + DataType::Boolean, + indices.len(), + None, + nulls, + 0, + vec![val_buf.into()], + vec![], + ) + }; Ok(BooleanArray::from(data)) } @@ -713,14 +717,17 @@ where }; } - let mut data = ArrayData::builder(::DATA_TYPE) - .len(data_len) - .add_buffer(offsets_buffer.into()) - .add_buffer(values.into()); + let mut array_data = + ArrayData::builder(::DATA_TYPE) + .len(data_len) + .add_buffer(offsets_buffer.into()) + .add_buffer(values.into()); if let Some(null_buffer) = nulls { - data = data.null_bit_buffer(null_buffer); + array_data = array_data.null_bit_buffer(null_buffer); } - Ok(GenericStringArray::::from(data.build())) + let array_data = unsafe { array_data.build_unchecked() }; + + Ok(GenericStringArray::::from(array_data)) } /// `take` implementation for list arrays @@ -768,8 +775,10 @@ where .null_bit_buffer(null_buf.into()) .offset(0) .add_child_data(taken.data().clone()) - .add_buffer(value_offsets) - .build(); + .add_buffer(value_offsets); + + let list_data = unsafe { list_data.build_unchecked() }; + Ok(GenericListArray::::from(list_data)) } @@ -808,8 +817,9 @@ where .len(indices.len()) .null_bit_buffer(null_buf.into()) .offset(0) - .add_child_data(taken.data().clone()) - .build(); + .add_child_data(taken.data().clone()); + + let list_data = unsafe { list_data.build_unchecked() }; Ok(FixedSizeListArray::from(list_data)) } @@ -884,15 +894,17 @@ where let new_keys = take_primitive::(values.keys(), indices)?; let new_keys_data = new_keys.data_ref(); - let data = ArrayData::new( - values.data_type().clone(), - new_keys.len(), - Some(new_keys_data.null_count()), - new_keys_data.null_buffer().cloned(), - 0, - new_keys_data.buffers().to_vec(), - values.data().child_data().to_vec(), - ); + let data = unsafe { + 
ArrayData::new_unchecked( + values.data_type().clone(), + new_keys.len(), + Some(new_keys_data.null_count()), + new_keys_data.null_buffer().cloned(), + 0, + new_keys_data.buffers().to_vec(), + values.data().child_data().to_vec(), + ) + }; Ok(DictionaryArray::::from(data)) } @@ -1383,7 +1395,8 @@ mod tests { .len(3) .add_buffer(value_offsets) .add_child_data(value_data) - .build(); + .build() + .unwrap(); let list_array = $list_array_type::from(list_data); // index returns: [[2,3], null, [-1,-2,-1], [2,3], [0,0,0]] @@ -1421,7 +1434,8 @@ mod tests { ) .add_buffer(expected_offsets) .add_child_data(expected_data) - .build(); + .build() + .unwrap(); let expected_list_array = $list_array_type::from(expected_list_data); assert_eq!(a, &expected_list_array); @@ -1458,7 +1472,8 @@ mod tests { .add_buffer(value_offsets) .null_bit_buffer(Buffer::from([0b10111101, 0b00000000])) .add_child_data(value_data) - .build(); + .build() + .unwrap(); let list_array = $list_array_type::from(list_data); // index returns: [[null], null, [-1,-2,3], [2,null], [0,null,0]] @@ -1495,7 +1510,8 @@ mod tests { ) .add_buffer(expected_offsets) .add_child_data(expected_data) - .build(); + .build() + .unwrap(); let expected_list_array = $list_array_type::from(expected_list_data); assert_eq!(a, &expected_list_array); @@ -1531,7 +1547,8 @@ mod tests { .add_buffer(value_offsets) .null_bit_buffer(Buffer::from([0b01111101])) .add_child_data(value_data) - .build(); + .build() + .unwrap(); let list_array = $list_array_type::from(list_data); // index returns: [null, null, [-1,-2,3], [5,null], [0,null,0]] @@ -1569,7 +1586,8 @@ mod tests { .null_bit_buffer(Buffer::from(null_bits)) .add_buffer(expected_offsets) .add_child_data(expected_data) - .build(); + .build() + .unwrap(); let expected_list_array = $list_array_type::from(expected_list_data); assert_eq!(a, &expected_list_array); @@ -1698,7 +1716,8 @@ mod tests { .len(3) .add_buffer(value_offsets) .add_child_data(value_data) - .build(); + .build() + .unwrap(); 
let list_array = ListArray::from(list_data); let index = UInt32Array::from(vec![1000]); diff --git a/arrow/src/compute/util.rs b/arrow/src/compute/util.rs index 6d4d0e40a9b4..f4ddbaf56d1e 100644 --- a/arrow/src/compute/util.rs +++ b/arrow/src/compute/util.rs @@ -185,15 +185,18 @@ pub(super) mod tests { null_bit_buffer: Option, ) -> Arc { // empty vec for buffers and children is not really correct, but for these tests we only care about the null bitmap - Arc::new(ArrayData::new( - DataType::UInt8, - len, - None, - null_bit_buffer, - offset, - vec![], - vec![], - )) + Arc::new( + ArrayData::try_new( + DataType::UInt8, + len, + None, + null_bit_buffer, + offset, + vec![], + vec![], + ) + .unwrap(), + ) } #[test] @@ -333,7 +336,8 @@ pub(super) mod tests { .null_bit_buffer(list_bitmap.into()) .add_buffer(value_offsets) .add_child_data(value_data) - .build(); + .build() + .unwrap(); GenericListArray::::from(list_data) } @@ -397,7 +401,8 @@ pub(super) mod tests { .len(list_len) .null_bit_buffer(list_bitmap.into()) .add_child_data(child_data) - .build(); + .build() + .unwrap(); FixedSizeListArray::from(list_data) } diff --git a/arrow/src/ffi.rs b/arrow/src/ffi.rs index 36d7f26f3f20..a61f291bd4ab 100644 --- a/arrow/src/ffi.rs +++ b/arrow/src/ffi.rs @@ -514,15 +514,18 @@ pub trait ArrowArrayRef { .map(|d| d.unwrap()) .collect(); - Ok(ArrayData::new( - data_type, - len, - Some(null_count), - null_bit_buffer, - offset, - buffers, - child_data, - )) + // Should FFI be checking validity? + Ok(unsafe { + ArrayData::new_unchecked( + data_type, + len, + Some(null_count), + null_bit_buffer, + offset, + buffers, + child_data, + ) + }) } /// returns all buffers, as organized by Rust (i.e. 
null buffer is skipped) @@ -862,7 +865,8 @@ mod tests { let value_data = ArrayData::builder(DataType::Int32) .len(8) .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7])) - .build(); + .build() + .unwrap(); // Construct a buffer for value offsets, for the nested array: // [[0, 1, 2], [3, 4, 5], [6, 7]] @@ -883,7 +887,8 @@ mod tests { .len(3) .add_buffer(value_offsets) .add_child_data(value_data) - .build(); + .build() + .unwrap(); // create an array natively let array = GenericListArray::::from(list_data.clone()); diff --git a/arrow/src/ipc/reader.rs b/arrow/src/ipc/reader.rs index 50e858f098a8..088e9add9aa2 100644 --- a/arrow/src/ipc/reader.rs +++ b/arrow/src/ipc/reader.rs @@ -189,7 +189,8 @@ fn create_array( let data = ArrayData::builder(data_type.clone()) .len(length) .offset(0) - .build(); + .build() + .unwrap(); node_index += 1; // no buffer increases make_array(data) @@ -230,7 +231,7 @@ fn create_primitive_array( if null_count > 0 { builder = builder.null_bit_buffer(buffers[0].clone()) } - builder.build() + builder.build().unwrap() } FixedSizeBinary(_) => { // read 3 buffers @@ -241,7 +242,7 @@ fn create_primitive_array( if null_count > 0 { builder = builder.null_bit_buffer(buffers[0].clone()) } - builder.build() + unsafe { builder.build_unchecked() } } Int8 | Int16 @@ -261,7 +262,8 @@ fn create_primitive_array( if null_count > 0 { builder = builder.null_bit_buffer(buffers[0].clone()) } - let values = Arc::new(Int64Array::from(builder.build())) as ArrayRef; + let data = unsafe { builder.build_unchecked() }; + let values = Arc::new(Int64Array::from(data)) as ArrayRef; // this cast is infallible, the unwrap is safe let casted = cast(&values, data_type).unwrap(); casted.data().clone() @@ -273,7 +275,7 @@ fn create_primitive_array( if null_count > 0 { builder = builder.null_bit_buffer(buffers[0].clone()) } - builder.build() + unsafe { builder.build_unchecked() } } } Float32 => { @@ -286,7 +288,8 @@ fn create_primitive_array( if null_count > 0 { builder = 
builder.null_bit_buffer(buffers[0].clone()) } - let values = Arc::new(Float64Array::from(builder.build())) as ArrayRef; + let data = unsafe { builder.build_unchecked() }; + let values = Arc::new(Float64Array::from(data)) as ArrayRef; // this cast is infallible, the unwrap is safe let casted = cast(&values, data_type).unwrap(); casted.data().clone() @@ -298,7 +301,7 @@ fn create_primitive_array( if null_count > 0 { builder = builder.null_bit_buffer(buffers[0].clone()) } - builder.build() + unsafe { builder.build_unchecked() } } } Boolean @@ -317,7 +320,7 @@ fn create_primitive_array( if null_count > 0 { builder = builder.null_bit_buffer(buffers[0].clone()) } - builder.build() + unsafe { builder.build_unchecked() } } Decimal(_, _) => { // read 3 buffers @@ -328,7 +331,7 @@ fn create_primitive_array( if null_count > 0 { builder = builder.null_bit_buffer(buffers[0].clone()) } - builder.build() + unsafe { builder.build_unchecked() } } t => panic!("Data type {:?} either unsupported or not primitive", t), }; @@ -354,7 +357,7 @@ fn create_list_array( if null_count > 0 { builder = builder.null_bit_buffer(buffers[0].clone()) } - make_array(builder.build()) + make_array(unsafe { builder.build_unchecked() }) } else if let DataType::LargeList(_) = *data_type { let null_count = field_node.null_count() as usize; let mut builder = ArrayData::builder(data_type.clone()) @@ -365,7 +368,7 @@ fn create_list_array( if null_count > 0 { builder = builder.null_bit_buffer(buffers[0].clone()) } - make_array(builder.build()) + make_array(unsafe { builder.build_unchecked() }) } else if let DataType::FixedSizeList(_, _) = *data_type { let null_count = field_node.null_count() as usize; let mut builder = ArrayData::builder(data_type.clone()) @@ -376,7 +379,7 @@ fn create_list_array( if null_count > 0 { builder = builder.null_bit_buffer(buffers[0].clone()) } - make_array(builder.build()) + make_array(unsafe { builder.build_unchecked() }) } else if let DataType::Map(_, _) = *data_type { let 
null_count = field_node.null_count() as usize; let mut builder = ArrayData::builder(data_type.clone()) @@ -387,7 +390,7 @@ fn create_list_array( if null_count > 0 { builder = builder.null_bit_buffer(buffers[0].clone()) } - make_array(builder.build()) + make_array(unsafe { builder.build_unchecked() }) } else { panic!("Cannot create list or map array from {:?}", data_type) } @@ -411,7 +414,7 @@ fn create_dictionary_array( if null_count > 0 { builder = builder.null_bit_buffer(buffers[0].clone()) } - make_array(builder.build()) + make_array(unsafe { builder.build_unchecked() }) } else { unreachable!("Cannot create dictionary array from {:?}", data_type) } diff --git a/arrow/src/json/reader.rs b/arrow/src/json/reader.rs index c2a2de924e4d..eb78e0a420fc 100644 --- a/arrow/src/json/reader.rs +++ b/arrow/src/json/reader.rs @@ -998,11 +998,13 @@ impl Decoder { }); } }); - ArrayData::builder(list_field.data_type().clone()) - .len(valid_len) - .add_buffer(bool_values.into()) - .null_bit_buffer(bool_nulls.into()) - .build() + unsafe { + ArrayData::builder(list_field.data_type().clone()) + .len(valid_len) + .add_buffer(bool_values.into()) + .null_bit_buffer(bool_nulls.into()) + .build_unchecked() + } } DataType::Int8 => self.read_primitive_list_values::(rows), DataType::Int16 => self.read_primitive_list_values::(rows), @@ -1076,11 +1078,15 @@ impl Decoder { self.build_struct_array(rows.as_slice(), fields.as_slice(), &[])?; let data_type = DataType::Struct(fields.clone()); let buf = null_buffer.into(); - ArrayDataBuilder::new(data_type) - .len(rows.len()) - .null_bit_buffer(buf) - .child_data(arrays.into_iter().map(|a| a.data().clone()).collect()) - .build() + unsafe { + ArrayDataBuilder::new(data_type) + .len(rows.len()) + .null_bit_buffer(buf) + .child_data( + arrays.into_iter().map(|a| a.data().clone()).collect(), + ) + .build_unchecked() + } } datatype => { return Err(ArrowError::JsonError(format!( @@ -1094,8 +1100,8 @@ impl Decoder { .len(list_len) 
.add_buffer(Buffer::from_slice_ref(&offsets)) .add_child_data(array_data) - .null_bit_buffer(list_nulls.into()) - .build(); + .null_bit_buffer(list_nulls.into()); + let list_data = unsafe { list_data.build_unchecked() }; Ok(Arc::new(GenericListArray::::from(list_data))) } @@ -1291,8 +1297,8 @@ impl Decoder { .null_bit_buffer(null_buffer.into()) .child_data( arrays.into_iter().map(|a| a.data().clone()).collect(), - ) - .build(); + ); + let data = unsafe { data.build_unchecked() }; Ok(make_array(data)) } DataType::Map(map_field, _) => self.build_map_array( @@ -1384,26 +1390,28 @@ impl Decoder { &[], )?; - Ok(make_array(ArrayData::new( - map_type.clone(), - rows_len, - None, - Some(list_bitmap.into()), - 0, - vec![Buffer::from_slice_ref(&list_offsets)], - vec![ArrayData::new( - struct_field.data_type().clone(), - struct_children[0].len(), - None, + unsafe { + Ok(make_array(ArrayData::new_unchecked( + map_type.clone(), + rows_len, None, + Some(list_bitmap.into()), 0, - vec![], - struct_children - .into_iter() - .map(|array| array.data().clone()) - .collect(), - )], - ))) + vec![Buffer::from_slice_ref(&list_offsets)], + vec![ArrayData::new_unchecked( + struct_field.data_type().clone(), + struct_children[0].len(), + None, + None, + 0, + vec![], + struct_children + .into_iter() + .map(|array| array.data().clone()) + .collect(), + )], + ))) + } } #[inline(always)] @@ -2159,14 +2167,16 @@ mod tests { .len(4) .add_child_data(d.data().clone()) .null_bit_buffer(Buffer::from(vec![0b00000101])) - .build(); + .build() + .unwrap(); let b = BooleanArray::from(vec![Some(true), Some(false), Some(true), None]); let a = ArrayDataBuilder::new(a_field.data_type().clone()) .len(4) .add_child_data(b.data().clone()) .add_child_data(c) .null_bit_buffer(Buffer::from(vec![0b00000111])) - .build(); + .build() + .unwrap(); let expected = make_array(a); // compare `a` with result from json reader @@ -2223,7 +2233,8 @@ mod tests { .len(7) .add_child_data(d.data().clone()) 
.null_bit_buffer(Buffer::from(vec![0b00111011])) - .build(); + .build() + .unwrap(); let b = BooleanArray::from(vec![ Some(true), Some(false), @@ -2238,13 +2249,15 @@ mod tests { .add_child_data(b.data().clone()) .add_child_data(c.clone()) .null_bit_buffer(Buffer::from(vec![0b00111111])) - .build(); + .build() + .unwrap(); let a_list = ArrayDataBuilder::new(a_field.data_type().clone()) .len(6) .add_buffer(Buffer::from_slice_ref(&[0i32, 2, 3, 6, 6, 6, 7])) .add_child_data(a) .null_bit_buffer(Buffer::from(vec![0b00110111])) - .build(); + .build() + .unwrap(); let expected = make_array(a_list); // compare `a` with result from json reader @@ -2342,18 +2355,21 @@ mod tests { )) .add_child_data(expected_value_array_data) .null_bit_buffer(Buffer::from(vec![0b01010111])) - .build(); + .build() + .unwrap(); let expected_stocks_entries_data = ArrayDataBuilder::new(entries_struct_type) .len(7) .add_child_data(expected_keys) .add_child_data(expected_values) - .build(); + .build() + .unwrap(); let expected_stocks_data = ArrayDataBuilder::new(stocks_field.data_type().clone()) .len(3) .add_buffer(Buffer::from(vec![0i32, 2, 4, 7].to_byte_slice())) .add_child_data(expected_stocks_entries_data) - .build(); + .build() + .unwrap(); let expected_stocks = make_array(expected_stocks_data); diff --git a/arrow/src/json/writer.rs b/arrow/src/json/writer.rs index 52ef9459553d..4279ab786fa0 100644 --- a/arrow/src/json/writer.rs +++ b/arrow/src/json/writer.rs @@ -1028,7 +1028,8 @@ mod tests { .add_buffer(a_value_offsets) .add_child_data(a_values.data().clone()) .null_bit_buffer(Buffer::from(vec![0b00011111])) - .build(); + .build() + .unwrap(); let a = ListArray::from(a_list_data); let b = Int32Array::from(vec![1, 2, 3, 4, 5]); @@ -1079,14 +1080,16 @@ mod tests { .add_buffer(a_value_offsets) .null_bit_buffer(Buffer::from(vec![0b00000111])) .add_child_data(a_values.data().clone()) - .build(); + .build() + .unwrap(); let c1_value_offsets = Buffer::from(&[0, 2, 2, 3].to_byte_slice()); let 
c1_list_data = ArrayData::builder(field_c1.data_type().clone()) .len(3) .add_buffer(c1_value_offsets) .add_child_data(a_list_data) - .build(); + .build() + .unwrap(); let c1 = ListArray::from(c1_list_data); let c2 = StringArray::from(vec![Some("foo"), Some("bar"), None]); @@ -1160,7 +1163,8 @@ mod tests { .add_buffer(c1_value_offsets) .add_child_data(struct_values.data().clone()) .null_bit_buffer(Buffer::from(vec![0b00000101])) - .build(); + .build() + .unwrap(); let c1 = ListArray::from(c1_list_data); let c2 = Int32Array::from(vec![1, 2, 3]); diff --git a/arrow/src/record_batch.rs b/arrow/src/record_batch.rs index b6e5566495a2..b441f6cf295e 100644 --- a/arrow/src/record_batch.rs +++ b/arrow/src/record_batch.rs @@ -608,7 +608,8 @@ mod tests { .add_child_data(a2_child.data().clone()) .len(2) .add_buffer(Buffer::from(vec![0i32, 3, 4].to_byte_slice())) - .build(); + .build() + .unwrap(); let a2: ArrayRef = Arc::new(ListArray::from(a2)); let a = ArrayDataBuilder::new(DataType::Struct(vec![ Field::new("aa1", DataType::Int32, false), @@ -617,7 +618,8 @@ mod tests { .add_child_data(a1.data().clone()) .add_child_data(a2.data().clone()) .len(2) - .build(); + .build() + .unwrap(); let a: ArrayRef = Arc::new(StructArray::from(a)); // creating the batch with field name validation should fail diff --git a/arrow/src/util/data_gen.rs b/arrow/src/util/data_gen.rs index 08624ae86ee8..35b65ef303db 100644 --- a/arrow/src/util/data_gen.rs +++ b/arrow/src/util/data_gen.rs @@ -192,15 +192,17 @@ fn create_random_list_array( true => Some(create_random_null_buffer(size, null_density)), false => None, }; - let list_data = ArrayData::new( - field.data_type().clone(), - size, - None, - null_buffer, - 0, - vec![offsets], - vec![child_data.clone()], - ); + let list_data = unsafe { + ArrayData::new_unchecked( + field.data_type().clone(), + size, + None, + null_buffer, + 0, + vec![offsets], + vec![child_data.clone()], + ) + }; Ok(make_array(list_data)) } diff --git 
a/arrow/src/util/integration_util.rs b/arrow/src/util/integration_util.rs index ada2494d3c2d..1a402bc6e368 100644 --- a/arrow/src/util/integration_util.rs +++ b/arrow/src/util/integration_util.rs @@ -944,7 +944,8 @@ mod tests { .len(3) .add_buffer(value_offsets) .add_child_data(value_data.data().clone()) - .build(); + .build() + .unwrap(); let lists = ListArray::from(list_data); let structs_int32s = Int32Array::from(vec![None, Some(-2), None]); diff --git a/integration-testing/src/lib.rs b/integration-testing/src/lib.rs index 6db3fce91e0d..f25157f635bc 100644 --- a/integration-testing/src/lib.rs +++ b/integration-testing/src/lib.rs @@ -421,7 +421,8 @@ fn array_from_json( .add_buffer(Buffer::from(&offsets.to_byte_slice())) .add_child_data(child_array.data().clone()) .null_bit_buffer(null_buf) - .build(); + .build() + .unwrap(); Ok(Arc::new(ListArray::from(list_data))) } DataType::LargeList(child_field) => { @@ -448,7 +449,8 @@ fn array_from_json( .add_buffer(Buffer::from(&offsets.to_byte_slice())) .add_child_data(child_array.data().clone()) .null_bit_buffer(null_buf) - .build(); + .build() + .unwrap(); Ok(Arc::new(LargeListArray::from(list_data))) } DataType::FixedSizeList(child_field, _) => { @@ -463,7 +465,8 @@ fn array_from_json( .len(json_col.count) .add_child_data(child_array.data().clone()) .null_bit_buffer(null_buf) - .build(); + .build() + .unwrap(); Ok(Arc::new(FixedSizeListArray::from(list_data))) } DataType::Struct(fields) => { @@ -478,7 +481,7 @@ fn array_from_json( array_data = array_data.add_child_data(array.data().clone()); } - let array = StructArray::from(array_data.build()); + let array = StructArray::from(array_data.build().unwrap()); Ok(Arc::new(array)) } DataType::Dictionary(key_type, value_type) => { @@ -557,7 +560,8 @@ fn dictionary_array_from_json( .add_buffer(keys.data().buffers()[0].clone()) .null_bit_buffer(null_buf) .add_child_data(values.data().clone()) - .build(); + .build() + .unwrap(); let array = match dict_key { DataType::Int8 => { 
diff --git a/parquet/src/arrow/array_reader.rs b/parquet/src/arrow/array_reader.rs index d3259c46bbad..ae001ed73391 100644 --- a/parquet/src/arrow/array_reader.rs +++ b/parquet/src/arrow/array_reader.rs @@ -324,25 +324,20 @@ impl ArrayReader for PrimitiveArrayReader { array_data = array_data.null_bit_buffer(b); } + let array_data = unsafe { array_data.build_unchecked() }; let array = match T::get_physical_type() { - PhysicalType::BOOLEAN => { - Arc::new(BooleanArray::from(array_data.build())) as ArrayRef - } + PhysicalType::BOOLEAN => Arc::new(BooleanArray::from(array_data)) as ArrayRef, PhysicalType::INT32 => { - Arc::new(PrimitiveArray::::from(array_data.build())) - as ArrayRef + Arc::new(PrimitiveArray::::from(array_data)) as ArrayRef } PhysicalType::INT64 => { - Arc::new(PrimitiveArray::::from(array_data.build())) - as ArrayRef + Arc::new(PrimitiveArray::::from(array_data)) as ArrayRef } PhysicalType::FLOAT => { - Arc::new(PrimitiveArray::::from(array_data.build())) - as ArrayRef + Arc::new(PrimitiveArray::::from(array_data)) as ArrayRef } PhysicalType::DOUBLE => { - Arc::new(PrimitiveArray::::from(array_data.build())) - as ArrayRef + Arc::new(PrimitiveArray::::from(array_data)) as ArrayRef } PhysicalType::INT96 | PhysicalType::BYTE_ARRAY @@ -904,8 +899,9 @@ impl ArrayReader for ListArrayReader { .add_buffer(value_offsets) .add_child_data(batch_values.data().clone()) .null_bit_buffer(null_buf.into()) - .offset(next_batch_array.offset()) - .build(); + .offset(next_batch_array.offset()); + + let list_data = unsafe { list_data.build_unchecked() }; let result_array = GenericListArray::::from(list_data); Ok(Arc::new(result_array)) @@ -1000,8 +996,8 @@ impl ArrayReader for MapArrayReader { let entry_data = ArrayDataBuilder::new(entry_data_type) .len(key_length) .add_child_data(key_array.data().clone()) - .add_child_data(value_array.data().clone()) - .build(); + .add_child_data(value_array.data().clone()); + let entry_data = unsafe { entry_data.build_unchecked() }; 
let entry_len = rep_levels.iter().filter(|level| **level == 0).count(); @@ -1044,8 +1040,9 @@ impl ArrayReader for MapArrayReader { .len(entry_len) .add_buffer(value_offsets) .null_bit_buffer(null_buf.into()) - .add_child_data(entry_data) - .build(); + .add_child_data(entry_data); + + let array_data = unsafe { array_data.build_unchecked() }; Ok(Arc::new(MapArray::from(array_data))) } @@ -1192,8 +1189,8 @@ impl ArrayReader for StructArrayReader { .iter() .map(|x| x.data().clone()) .collect::>(), - ) - .build(); + ); + let array_data = unsafe { array_data.build_unchecked() }; // calculate struct rep level data, since struct doesn't add to repetition // levels, here we just need to keep repetition levels of first array diff --git a/parquet/src/arrow/arrow_array_reader.rs b/parquet/src/arrow/arrow_array_reader.rs index 04de2d4d2592..3f2acf4568d7 100644 --- a/parquet/src/arrow/arrow_array_reader.rs +++ b/parquet/src/arrow/arrow_array_reader.rs @@ -621,10 +621,12 @@ impl ArrayReader for ArrowArrayReader<'static, C> { mutable.extend_nulls(nulls_to_add); } - value_array_data = mutable - .into_builder() - .null_bit_buffer(null_bitmap_array.values().clone()) - .build(); + value_array_data = unsafe { + mutable + .into_builder() + .null_bit_buffer(null_bitmap_array.values().clone()) + .build_unchecked() + }; } let mut array = arrow::array::make_array(value_array_data); if array.data_type() != &self.data_type { @@ -1144,8 +1146,8 @@ impl ArrayConverter for PrimitiveArrayConverter { let value_count = values_buffer.len() / value_size; let array_data = arrow::array::ArrayData::builder(T::DATA_TYPE) .len(value_count) - .add_buffer(values_buffer.into()) - .build(); + .add_buffer(values_buffer.into()); + let array_data = unsafe { array_data.build_unchecked() }; Ok(array_data) } } @@ -1192,8 +1194,8 @@ impl ArrayConverter for StringArrayConverter { let array_data = arrow::array::ArrayData::builder(ArrowType::Utf8) .len(data_len) .add_buffer(offsets_buffer.into()) - 
.add_buffer(values_buffer.into()) - .build(); + .add_buffer(values_buffer.into()); + let array_data = unsafe { array_data.build_unchecked() }; Ok(array_data) } } diff --git a/parquet/src/arrow/arrow_writer.rs b/parquet/src/arrow/arrow_writer.rs index 29bb54fa0666..8600eb0b5101 100644 --- a/parquet/src/arrow/arrow_writer.rs +++ b/parquet/src/arrow/arrow_writer.rs @@ -710,7 +710,8 @@ mod tests { .add_buffer(a_value_offsets) .add_child_data(a_values.data().clone()) .null_bit_buffer(Buffer::from(vec![0b00011011])) - .build(); + .build() + .unwrap(); let a = ListArray::from(a_list_data); // build a record batch @@ -749,7 +750,8 @@ mod tests { .len(5) .add_buffer(a_value_offsets) .add_child_data(a_values.data().clone()) - .build(); + .build() + .unwrap(); let a = ListArray::from(a_list_data); // build a record batch @@ -870,7 +872,8 @@ mod tests { .len(5) .add_buffer(g_value_offsets.clone()) .add_child_data(g_value.data().clone()) - .build(); + .build() + .unwrap(); let g = ListArray::from(g_list_data); // The difference between g and h is that h has a null bitmap let h_list_data = ArrayData::builder(struct_field_h.data_type().clone()) @@ -878,7 +881,8 @@ mod tests { .add_buffer(g_value_offsets) .add_child_data(g_value.data().clone()) .null_bit_buffer(Buffer::from(vec![0b00011011])) - .build(); + .build() + .unwrap(); let h = ListArray::from(h_list_data); let e = StructArray::from(vec![ @@ -998,13 +1002,15 @@ mod tests { .len(6) .null_bit_buffer(Buffer::from(vec![0b00100111])) .add_child_data(c.data().clone()) - .build(); + .build() + .unwrap(); let b = StructArray::from(b_data); let a_data = ArrayDataBuilder::new(field_a.data_type().clone()) .len(6) .null_bit_buffer(Buffer::from(vec![0b00101111])) .add_child_data(b.data().clone()) - .build(); + .build() + .unwrap(); let a = StructArray::from(a_data); assert_eq!(a.null_count(), 1); @@ -1033,12 +1039,14 @@ mod tests { let b_data = ArrayDataBuilder::new(field_b.data_type().clone()) .len(6) .add_child_data(c.data().clone()) 
- .build(); + .build() + .unwrap(); let b = StructArray::from(b_data); let a_data = ArrayDataBuilder::new(field_a.data_type().clone()) .len(6) .add_child_data(b.data().clone()) - .build(); + .build() + .unwrap(); let a = StructArray::from(a_data); assert_eq!(a.null_count(), 0); @@ -1068,13 +1076,15 @@ mod tests { .len(6) .null_bit_buffer(Buffer::from(vec![0b00100111])) .add_child_data(c.data().clone()) - .build(); + .build() + .unwrap(); let b = StructArray::from(b_data); // a intentionally has no null buffer, to test that this is handled correctly let a_data = ArrayDataBuilder::new(field_a.data_type().clone()) .len(6) .add_child_data(b.data().clone()) - .build(); + .build() + .unwrap(); let a = StructArray::from(a_data); assert_eq!(a.null_count(), 0); @@ -1525,7 +1535,8 @@ mod tests { .add_buffer(a_value_offsets) .null_bit_buffer(Buffer::from(vec![0b00011011])) .add_child_data(a_values.data().clone()) - .build(); + .build() + .unwrap(); assert_eq!(a_list_data.null_count(), 1); @@ -1549,7 +1560,8 @@ mod tests { .add_buffer(a_value_offsets) .add_child_data(a_values.data().clone()) .null_bit_buffer(Buffer::from(vec![0b00011011])) - .build(); + .build() + .unwrap(); // I think this setup is incorrect because this should pass assert_eq!(a_list_data.null_count(), 1); diff --git a/parquet/src/arrow/levels.rs b/parquet/src/arrow/levels.rs index 3be315b71e69..c9b6052aeb87 100644 --- a/parquet/src/arrow/levels.rs +++ b/parquet/src/arrow/levels.rs @@ -1304,7 +1304,8 @@ mod tests { .add_buffer(a_value_offsets) .null_bit_buffer(Buffer::from(vec![0b00011011])) .add_child_data(a_values.data().clone()) - .build(); + .build() + .unwrap(); assert_eq!(a_list_data.null_count(), 1); @@ -1407,7 +1408,8 @@ mod tests { .len(5) .add_buffer(g_value_offsets) .add_child_data(g_value.data().clone()) - .build(); + .build() + .unwrap(); let g = ListArray::from(g_list_data); let e = StructArray::from(vec![ From 8afbf5357575a548a536a5c7fe7c910b56c17470 Mon Sep 17 00:00:00 2001 From: Andrew Lamb 
Date: Sat, 9 Oct 2021 12:16:07 -0400 Subject: [PATCH 2/3] Fix compile for simd --- arrow/src/compute/kernels/arithmetic.rs | 164 +++++++++++++----------- arrow/src/compute/kernels/comparison.rs | 40 +++--- 2 files changed, 111 insertions(+), 93 deletions(-) diff --git a/arrow/src/compute/kernels/arithmetic.rs b/arrow/src/compute/kernels/arithmetic.rs index a15a9b306f0a..f92888b37965 100644 --- a/arrow/src/compute/kernels/arithmetic.rs +++ b/arrow/src/compute/kernels/arithmetic.rs @@ -78,18 +78,20 @@ where }, ); - let data = ArrayData::new( - T::DATA_TYPE, - array.len(), - None, - array - .data_ref() - .null_buffer() - .map(|b| b.bit_slice(array.offset(), array.len())), - 0, - vec![result.into()], - vec![], - ); + let data = unsafe { + ArrayData::new_unchecked( + T::DATA_TYPE, + array.len(), + None, + array + .data_ref() + .null_buffer() + .map(|b| b.bit_slice(array.offset(), array.len())), + 0, + vec![result.into()], + vec![], + ) + }; Ok(PrimitiveArray::::from(data)) } @@ -130,18 +132,20 @@ where }, ); - let data = ArrayData::new( - T::DATA_TYPE, - array.len(), - None, - array - .data_ref() - .null_buffer() - .map(|b| b.bit_slice(array.offset(), array.len())), - 0, - vec![result.into()], - vec![], - ); + let data = unsafe { + ArrayData::new_unchecked( + T::DATA_TYPE, + array.len(), + None, + array + .data_ref() + .null_buffer() + .map(|b| b.bit_slice(array.offset(), array.len())), + 0, + vec![result.into()], + vec![], + ) + }; Ok(PrimitiveArray::::from(data)) } @@ -419,15 +423,17 @@ where *scalar_result = scalar_op(*scalar_left, *scalar_right); }); - let data = ArrayData::new( - T::DATA_TYPE, - left.len(), - None, - null_bit_buffer, - 0, - vec![result.into()], - vec![], - ); + let data = unsafe { + ArrayData::new_unchecked( + T::DATA_TYPE, + left.len(), + None, + null_bit_buffer, + 0, + vec![result.into()], + vec![], + ) + }; Ok(PrimitiveArray::::from(data)) } @@ -731,15 +737,17 @@ where } } - let data = ArrayData::new( - T::DATA_TYPE, - left.len(), - None, - 
null_bit_buffer, - 0, - vec![result.into()], - vec![], - ); + let data = unsafe { + ArrayData::new_unchecked( + T::DATA_TYPE, + left.len(), + None, + null_bit_buffer, + 0, + vec![result.into()], + vec![], + ) + }; Ok(PrimitiveArray::::from(data)) } @@ -851,15 +859,17 @@ where } } - let data = ArrayData::new( - T::DATA_TYPE, - left.len(), - None, - null_bit_buffer, - 0, - vec![result.into()], - vec![], - ); + let data = unsafe { + ArrayData::new_unchecked( + T::DATA_TYPE, + left.len(), + None, + null_bit_buffer, + 0, + vec![result.into()], + vec![], + ) + }; Ok(PrimitiveArray::::from(data)) } @@ -897,18 +907,20 @@ where simd_checked_modulus_scalar_remainder::(array_chunks, modulo, result_chunks)?; - let data = ArrayData::new( - T::DATA_TYPE, - array.len(), - None, - array - .data_ref() - .null_buffer() - .map(|b| b.bit_slice(array.offset(), array.len())), - 0, - vec![result.into()], - vec![], - ); + let data = unsafe { + ArrayData::new_unchecked( + T::DATA_TYPE, + array.len(), + None, + array + .data_ref() + .null_buffer() + .map(|b| b.bit_slice(array.offset(), array.len())), + 0, + vec![result.into()], + vec![], + ) + }; Ok(PrimitiveArray::::from(data)) } @@ -946,18 +958,20 @@ where simd_checked_divide_scalar_remainder::(array_chunks, divisor, result_chunks)?; - let data = ArrayData::new( - T::DATA_TYPE, - array.len(), - None, - array - .data_ref() - .null_buffer() - .map(|b| b.bit_slice(array.offset(), array.len())), - 0, - vec![result.into()], - vec![], - ); + let data = unsafe { + ArrayData::new_unchecked( + T::DATA_TYPE, + array.len(), + None, + array + .data_ref() + .null_buffer() + .map(|b| b.bit_slice(array.offset(), array.len())), + 0, + vec![result.into()], + vec![], + ) + }; Ok(PrimitiveArray::::from(data)) } diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs index 60e275fe09e1..f246b24372aa 100644 --- a/arrow/src/compute/kernels/comparison.rs +++ b/arrow/src/compute/kernels/comparison.rs @@ -781,15 +781,17 @@ 
where &remainder_bitmask.to_le_bytes()[0..bit_util::ceil(left_remainder.len(), 8)]; result_remainder.copy_from_slice(remainder_mask_as_bytes); - let data = ArrayData::new( - DataType::Boolean, - len, - None, - null_bit_buffer, - 0, - vec![result.into()], - vec![], - ); + let data = unsafe { + ArrayData::new_unchecked( + DataType::Boolean, + len, + None, + null_bit_buffer, + 0, + vec![result.into()], + vec![], + ) + }; Ok(BooleanArray::from(data)) } @@ -864,15 +866,17 @@ where // null count is the same as in the input since the right side of the scalar comparison cannot be null let null_count = left.null_count(); - let data = ArrayData::new( - DataType::Boolean, - len, - Some(null_count), - null_bit_buffer, - 0, - vec![result.into()], - vec![], - ); + let data = unsafe { + ArrayData::new_unchecked( + DataType::Boolean, + len, + Some(null_count), + null_bit_buffer, + 0, + vec![result.into()], + vec![], + ) + }; Ok(BooleanArray::from(data)) } From f85fff63bba6b654d93d91e75693f867e195b4ad Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sun, 10 Oct 2021 06:16:19 -0400 Subject: [PATCH 3/3] remove unsafe in benches --- arrow/benches/array_from_vec.rs | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/arrow/benches/array_from_vec.rs b/arrow/benches/array_from_vec.rs index a589ee9ce8a5..3f82beb6f534 100644 --- a/arrow/benches/array_from_vec.rs +++ b/arrow/benches/array_from_vec.rs @@ -22,21 +22,11 @@ use criterion::Criterion; extern crate arrow; use arrow::array::*; -use arrow::buffer::Buffer; -use arrow::datatypes::*; use std::{convert::TryFrom, sync::Arc}; fn array_from_vec(n: usize) { - let mut v: Vec = Vec::with_capacity(n); - for i in 0..n { - v.push((i & 0xffff) as u8); - } - let arr_data = unsafe { - ArrayDataBuilder::new(DataType::Int32) - .add_buffer(Buffer::from(v)) - .build_unchecked() - }; - criterion::black_box(Int32Array::from(arr_data)); + let v: Vec = (0..n as i32).collect(); + criterion::black_box(Int32Array::from(v)); } fn 
array_string_from_vec(n: usize) {