diff --git a/arrow/src/array/array_binary.rs b/arrow/src/array/array_binary.rs index 5e0651078bde..3e49f60318ce 100644 --- a/arrow/src/array/array_binary.rs +++ b/arrow/src/array/array_binary.rs @@ -134,21 +134,35 @@ impl GenericBinaryArray { fn from_list(v: GenericListArray) -> Self { assert_eq!( - v.data_ref().child_data()[0].child_data().len(), + v.data_ref().child_data().len(), + 1, + "BinaryArray can only be created from list array of u8 values \ + (i.e. List>)." + ); + let child_data = &v.data_ref().child_data()[0]; + + assert_eq!( + child_data.child_data().len(), 0, "BinaryArray can only be created from list array of u8 values \ (i.e. List>)." ); assert_eq!( - v.data_ref().child_data()[0].data_type(), + child_data.data_type(), &DataType::UInt8, "BinaryArray can only be created from List arrays, mismatched data types." ); + assert_eq!( + child_data.null_count(), + 0, + "The child array cannot contain null values." + ); let builder = ArrayData::builder(Self::get_data_type()) .len(v.len()) + .offset(v.offset()) .add_buffer(v.data_ref().buffers()[0].clone()) - .add_buffer(v.data_ref().child_data()[0].buffers()[0].clone()) + .add_buffer(child_data.buffers()[0].slice(child_data.offset())) .null_bit_buffer(v.data_ref().null_buffer().cloned()); let data = unsafe { builder.build_unchecked() }; @@ -441,10 +455,7 @@ pub type LargeBinaryArray = GenericBinaryArray; #[cfg(test)] mod tests { use super::*; - use crate::{ - array::{LargeListArray, ListArray}, - datatypes::Field, - }; + use crate::{array::ListArray, datatypes::Field}; #[test] fn test_binary_array() { @@ -577,37 +588,38 @@ mod tests { assert_eq!(7, binary_array.value_length(1)); } - #[test] - fn test_binary_array_from_list_array() { - let values: [u8; 12] = [ - b'h', b'e', b'l', b'l', b'o', b'p', b'a', b'r', b'q', b'u', b'e', b't', - ]; - let values_data = ArrayData::builder(DataType::UInt8) + fn _test_generic_binary_array_from_list_array() { + let values = b"helloparquet"; + let child_data = ArrayData::builder(DataType::UInt8) .len(12) .add_buffer(Buffer::from(&values[..])) .build() .unwrap(); - let offsets: [i32; 4] = [0, 5, 5, 12]; + let offsets = [0, 5, 5, 12].map(|n| O::from_usize(n).unwrap()); // Array data: ["hello", "", "parquet"] - let array_data1 = ArrayData::builder(DataType::Binary) + let array_data1 = ArrayData::builder(GenericBinaryArray::::get_data_type()) .len(3) .add_buffer(Buffer::from_slice_ref(&offsets)) .add_buffer(Buffer::from_slice_ref(&values)) .build() .unwrap(); - let binary_array1 = BinaryArray::from(array_data1); + let binary_array1 = GenericBinaryArray::::from(array_data1); + + let data_type = if O::IS_LARGE { + DataType::LargeList + } else { + DataType::List + }(Box::new(Field::new("item", DataType::UInt8, false))); - let data_type = - DataType::List(Box::new(Field::new("item", DataType::UInt8, false))); let array_data2 = ArrayData::builder(data_type) .len(3) .add_buffer(Buffer::from_slice_ref(&offsets)) - .add_child_data(values_data) + .add_child_data(child_data) .build() .unwrap(); - let list_array = ListArray::from(array_data2); - let binary_array2 = BinaryArray::from(list_array); + let list_array = GenericListArray::::from(array_data2); + let binary_array2 = GenericBinaryArray::::from(list_array); assert_eq!(2, binary_array2.data().buffers().len()); assert_eq!(0, binary_array2.data().child_data().len()); @@ -624,51 +636,102 @@ mod tests { } } + #[test] + fn test_binary_array_from_list_array() { + _test_generic_binary_array_from_list_array::(); + } + #[test] fn test_large_binary_array_from_list_array() { - let values: [u8; 12] = [ - b'h', b'e', b'l', b'l', b'o', b'p', b'a', b'r', b'q', b'u', b'e', b't', - ]; - let values_data = ArrayData::builder(DataType::UInt8) - .len(12) + _test_generic_binary_array_from_list_array::(); + } + + fn _test_generic_binary_array_from_list_array_with_offset() { + let values = b"HelloArrowAndParquet"; + // b"ArrowAndParquet" + let child_data = ArrayData::builder(DataType::UInt8) + .len(15) + .offset(5) .add_buffer(Buffer::from(&values[..])) .build() .unwrap(); - let offsets: [i64; 4] = [0, 5, 5, 12]; - // Array data: ["hello", "", "parquet"] - let array_data1 = ArrayData::builder(DataType::LargeBinary) - .len(3) + let offsets = [0, 5, 8, 15].map(|n| O::from_usize(n).unwrap()); + let null_buffer = Buffer::from_slice_ref(&[0b101]); + let data_type = if O::IS_LARGE { + DataType::LargeList + } else { + DataType::List + }(Box::new(Field::new("item", DataType::UInt8, false))); + + // [None, Some(b"Parquet")] + let array_data = ArrayData::builder(data_type) + .len(2) + .offset(1) .add_buffer(Buffer::from_slice_ref(&offsets)) - .add_buffer(Buffer::from_slice_ref(&values)) + .null_bit_buffer(Some(null_buffer)) + .add_child_data(child_data) .build() .unwrap(); - let binary_array1 = LargeBinaryArray::from(array_data1); + let list_array = GenericListArray::::from(array_data); + let binary_array = GenericBinaryArray::::from(list_array); - let data_type = - DataType::LargeList(Box::new(Field::new("item", DataType::UInt8, false))); - let array_data2 = ArrayData::builder(data_type) - .len(3) + assert_eq!(2, binary_array.len()); + assert_eq!(1, binary_array.null_count()); + assert!(binary_array.is_null(0)); + assert!(binary_array.is_valid(1)); + assert_eq!(b"Parquet", binary_array.value(1)); + } + + #[test] + fn test_binary_array_from_list_array_with_offset() { + _test_generic_binary_array_from_list_array_with_offset::(); + } + + #[test] + fn test_large_binary_array_from_list_array_with_offset() { + _test_generic_binary_array_from_list_array_with_offset::(); + } + + fn _test_generic_binary_array_from_list_array_with_child_nulls_failed< + O: OffsetSizeTrait, + >() { + let values = b"HelloArrow"; + let child_data = ArrayData::builder(DataType::UInt8) + .len(10) + .add_buffer(Buffer::from(&values[..])) + .null_bit_buffer(Some(Buffer::from_slice_ref(&[0b1010101010]))) + .build() + .unwrap(); + + let offsets = [0, 5, 10].map(|n| O::from_usize(n).unwrap()); + let data_type = if O::IS_LARGE { + DataType::LargeList + } else { + DataType::List + }(Box::new(Field::new("item", DataType::UInt8, false))); + + // [None, Some(b"Parquet")] + let array_data = ArrayData::builder(data_type) + .len(2) .add_buffer(Buffer::from_slice_ref(&offsets)) - .add_child_data(values_data) + .add_child_data(child_data) .build() .unwrap(); - let list_array = LargeListArray::from(array_data2); - let binary_array2 = LargeBinaryArray::from(list_array); + let list_array = GenericListArray::::from(array_data); + drop(GenericBinaryArray::::from(list_array)); + } - assert_eq!(2, binary_array2.data().buffers().len()); - assert_eq!(0, binary_array2.data().child_data().len()); + #[test] + #[should_panic(expected = "The child array cannot contain null values.")] + fn test_binary_array_from_list_array_with_child_nulls_failed() { + _test_generic_binary_array_from_list_array_with_child_nulls_failed::(); + } - assert_eq!(binary_array1.len(), binary_array2.len()); - assert_eq!(binary_array1.null_count(), binary_array2.null_count()); - assert_eq!(binary_array1.value_offsets(), binary_array2.value_offsets()); - for i in 0..binary_array1.len() { - assert_eq!(binary_array1.value(i), binary_array2.value(i)); - assert_eq!(binary_array1.value(i), unsafe { - binary_array2.value_unchecked(i) - }); - assert_eq!(binary_array1.value_length(i), binary_array2.value_length(i)); - } + #[test] + #[should_panic(expected = "The child array cannot contain null values.")] + fn test_large_binary_array_from_list_array_with_child_nulls_failed() { + _test_generic_binary_array_from_list_array_with_child_nulls_failed::(); } fn test_generic_binary_array_from_opt_vec() { diff --git a/arrow/src/array/array_decimal.rs b/arrow/src/array/array_decimal.rs index 5ce407b0bc81..186d0a2f678a 100644 --- a/arrow/src/array/array_decimal.rs +++ b/arrow/src/array/array_decimal.rs @@ -187,24 +187,42 @@ pub trait BasicDecimalArray>: /// Build a decimal array from [`FixedSizeListArray`]. /// /// NB: This function does not validate that each value is in the permissible - /// range for a decimal. And, the null buffer of the child array will be ignored. + /// range for a decimal. #[deprecated(note = "please use `from_fixed_size_binary_array` instead")] fn from_fixed_size_list_array( v: FixedSizeListArray, precision: usize, scale: usize, ) -> U { + assert_eq!( + v.data_ref().child_data().len(), + 1, + "DecimalArray can only be created from list array of u8 values \ + (i.e. FixedSizeList>)." + ); let child_data = &v.data_ref().child_data()[0]; + assert_eq!( child_data.child_data().len(), 0, - "Decimal128Array can only be created from list array of u8 values \ + "DecimalArray can only be created from list array of u8 values \ (i.e. FixedSizeList>)." ); assert_eq!( child_data.data_type(), &DataType::UInt8, - "Decimal128Array can only be created from FixedSizeList arrays, mismatched data types." + "DecimalArray can only be created from FixedSizeList arrays, mismatched data types." + ); + assert!( + v.value_length() == Self::VALUE_LENGTH, + "Value length of the array ({}) must equal to the byte width of the decimal ({})", + v.value_length(), + Self::VALUE_LENGTH, + ); + assert_eq!( + v.data_ref().child_data()[0].null_count(), + 0, + "The child array cannot contain null values." ); let list_offset = v.offset(); @@ -841,6 +859,62 @@ mod tests { assert_eq!(decimal.value_as_string(1), "56".to_string()); } + #[test] + #[allow(deprecated)] + #[should_panic(expected = "The child array cannot contain null values.")] + fn test_decimal_array_from_fixed_size_list_with_child_nulls_failed() { + let value_data = ArrayData::builder(DataType::UInt8) + .len(16) + .add_buffer(Buffer::from_slice_ref(&[12_i128])) + .null_bit_buffer(Some(Buffer::from_slice_ref(&[0b1010101010101010]))) + .build() + .unwrap(); + + // Construct a list array from the above two + let list_data_type = DataType::FixedSizeList( + Box::new(Field::new("item", DataType::UInt8, false)), + 16, + ); + let list_data = ArrayData::builder(list_data_type) + .len(1) + .add_child_data(value_data) + .build() + .unwrap(); + let list_array = FixedSizeListArray::from(list_data); + drop(Decimal128Array::from_fixed_size_list_array( + list_array, 38, 0, + )); + } + + #[test] + #[allow(deprecated)] + #[should_panic( + expected = "Value length of the array (8) must equal to the byte width of the decimal (16)" + )] + fn test_decimal_array_from_fixed_size_list_with_wrong_length() { + let value_data = ArrayData::builder(DataType::UInt8) + .len(16) + .add_buffer(Buffer::from_slice_ref(&[12_i128])) + .null_bit_buffer(Some(Buffer::from_slice_ref(&[0b1010101010101010]))) + .build() + .unwrap(); + + // Construct a list array from the above two + let list_data_type = DataType::FixedSizeList( + Box::new(Field::new("item", DataType::UInt8, false)), + 8, + ); + let list_data = ArrayData::builder(list_data_type) + .len(2) + .add_child_data(value_data) + .build() + .unwrap(); + let list_array = FixedSizeListArray::from(list_data); + drop(Decimal128Array::from_fixed_size_list_array( + list_array, 38, 0, + )); + } + #[test] fn test_decimal256_iter() { let mut builder = Decimal256Builder::new(30, 76, 6); diff --git a/arrow/src/array/array_fixed_size_binary.rs b/arrow/src/array/array_fixed_size_binary.rs index e851fd3921b5..a811917c727c 100644 --- a/arrow/src/array/array_fixed_size_binary.rs +++ b/arrow/src/array/array_fixed_size_binary.rs @@ -291,20 +291,34 @@ impl From for ArrayData { impl From for FixedSizeBinaryArray { fn from(v: FixedSizeListArray) -> Self { assert_eq!( - v.data_ref().child_data()[0].child_data().len(), + v.data_ref().child_data().len(), + 1, + "FixedSizeBinaryArray can only be created from list array of u8 values \ + (i.e. FixedSizeList>)." + ); + let child_data = &v.data_ref().child_data()[0]; + + assert_eq!( + child_data.child_data().len(), 0, "FixedSizeBinaryArray can only be created from list array of u8 values \ (i.e. FixedSizeList>)." ); assert_eq!( - v.data_ref().child_data()[0].data_type(), + child_data.data_type(), &DataType::UInt8, "FixedSizeBinaryArray can only be created from FixedSizeList arrays, mismatched data types." ); + assert_eq!( + child_data.null_count(), + 0, + "The child array cannot contain null values." + ); let builder = ArrayData::builder(DataType::FixedSizeBinary(v.value_length())) .len(v.len()) - .add_buffer(v.data_ref().child_data()[0].buffers()[0].clone()) + .offset(v.offset()) + .add_buffer(child_data.buffers()[0].slice(child_data.offset())) .null_bit_buffer(v.data_ref().null_buffer().cloned()); let data = unsafe { builder.build_unchecked() }; @@ -412,6 +426,37 @@ mod tests { assert_eq!(10, fixed_size_binary_array.value_offset(1)); } + #[test] + fn test_fixed_size_binary_array_from_fixed_size_list_array() { + let values = [0_u8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]; + let values_data = ArrayData::builder(DataType::UInt8) + .len(12) + .offset(2) + .add_buffer(Buffer::from_slice_ref(&values)) + .build() + .unwrap(); + // [null, [10, 11, 12, 13]] + let array_data = unsafe { + ArrayData::builder(DataType::FixedSizeList( + Box::new(Field::new("item", DataType::UInt8, false)), + 4, + )) + .len(2) + .offset(1) + .add_child_data(values_data) + .null_bit_buffer(Some(Buffer::from_slice_ref(&[0b101]))) + .build_unchecked() + }; + let list_array = FixedSizeListArray::from(array_data); + let binary_array = FixedSizeBinaryArray::from(list_array); + + assert_eq!(2, binary_array.len()); + assert_eq!(1, binary_array.null_count()); + assert!(binary_array.is_null(0)); + assert!(binary_array.is_valid(1)); + assert_eq!(&[10, 11, 12, 13], binary_array.value(1)); + } + #[test] #[should_panic( expected = "FixedSizeBinaryArray can only be created from FixedSizeList arrays" @@ -419,7 +464,7 @@ mod tests { // Different error messages, so skip for now // https://github.com/apache/arrow-rs/issues/1545 #[cfg(not(feature = "force_validate"))] - fn test_fixed_size_binary_array_from_incorrect_list_array() { + fn test_fixed_size_binary_array_from_incorrect_fixed_size_list_array() { let values: [u32; 12] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]; let values_data = ArrayData::builder(DataType::UInt32) .len(12) @@ -440,6 +485,30 @@ mod tests { drop(FixedSizeBinaryArray::from(list_array)); } + #[test] + #[should_panic(expected = "The child array cannot contain null values.")] + fn test_fixed_size_binary_array_from_fixed_size_list_array_with_child_nulls_failed() { + let values = [0_u8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]; + let values_data = ArrayData::builder(DataType::UInt8) + .len(12) + .add_buffer(Buffer::from_slice_ref(&values)) + .null_bit_buffer(Some(Buffer::from_slice_ref(&[0b101010101010]))) + .build() + .unwrap(); + + let array_data = unsafe { + ArrayData::builder(DataType::FixedSizeList( + Box::new(Field::new("item", DataType::UInt8, false)), + 4, + )) + .len(3) + .add_child_data(values_data) + .build_unchecked() + }; + let list_array = FixedSizeListArray::from(array_data); + drop(FixedSizeBinaryArray::from(list_array)); + } + #[test] fn test_fixed_size_binary_array_fmt_debug() { let values: [u8; 15] = *b"hellotherearrow"; diff --git a/arrow/src/array/array_string.rs b/arrow/src/array/array_string.rs index 12a6b2f98b5f..c332aa197688 100644 --- a/arrow/src/array/array_string.rs +++ b/arrow/src/array/array_string.rs @@ -119,23 +119,40 @@ impl GenericStringArray { unsafe { self.value_unchecked(i) } } + /// Convert a list array to a string array. + /// This method is unsound because it does + /// not check the utf-8 validation for each element. fn from_list(v: GenericListArray) -> Self { assert_eq!( - v.data().child_data()[0].child_data().len(), + v.data_ref().child_data().len(), + 1, + "StringArray can only be created from list array of u8 values \ + (i.e. List>)." + ); + let child_data = &v.data_ref().child_data()[0]; + + assert_eq!( + child_data.child_data().len(), 0, "StringArray can only be created from list array of u8 values \ (i.e. List>)." ); assert_eq!( - v.data().child_data()[0].data_type(), + child_data.data_type(), &DataType::UInt8, "StringArray can only be created from List arrays, mismatched data types." ); + assert_eq!( + child_data.null_count(), + 0, + "The child array cannot contain null values." + ); let builder = ArrayData::builder(Self::get_data_type()) .len(v.len()) + .offset(v.offset()) .add_buffer(v.data().buffers()[0].clone()) - .add_buffer(v.data().child_data()[0].buffers()[0].clone()) + .add_buffer(child_data.buffers()[0].slice(child_data.offset())) .null_bit_buffer(v.data().null_buffer().cloned()); let array_data = unsafe { builder.build_unchecked() }; @@ -409,7 +426,10 @@ pub type LargeStringArray = GenericStringArray; #[cfg(test)] mod tests { - use crate::array::{ListBuilder, StringBuilder}; + use crate::{ + array::{ListBuilder, StringBuilder}, + datatypes::Field, + }; use super::*; @@ -675,4 +695,133 @@ mod tests { LargeStringArray::from_iter_values(BadIterator::new(3, 1, data.clone())); assert_eq!(expected, arr); } + + fn _test_generic_string_array_from_list_array() { + let values = b"HelloArrowAndParquet"; + // "ArrowAndParquet" + let child_data = ArrayData::builder(DataType::UInt8) + .len(15) + .offset(5) + .add_buffer(Buffer::from(&values[..])) + .build() + .unwrap(); + + let offsets = [0, 5, 8, 15].map(|n| O::from_usize(n).unwrap()); + let null_buffer = Buffer::from_slice_ref(&[0b101]); + let data_type = if O::IS_LARGE { + DataType::LargeList + } else { + DataType::List + }(Box::new(Field::new("item", DataType::UInt8, false))); + + // [None, Some("Parquet")] + let array_data = ArrayData::builder(data_type) + .len(2) + .offset(1) + .add_buffer(Buffer::from_slice_ref(&offsets)) + .null_bit_buffer(Some(null_buffer)) + .add_child_data(child_data) + .build() + .unwrap(); + let list_array = GenericListArray::::from(array_data); + let string_array = GenericStringArray::::from(list_array); + + assert_eq!(2, string_array.len()); + assert_eq!(1, string_array.null_count()); + assert!(string_array.is_null(0)); + assert!(string_array.is_valid(1)); + assert_eq!("Parquet", string_array.value(1)); + } + + #[test] + fn test_string_array_from_list_array() { + _test_generic_string_array_from_list_array::(); + } + + #[test] + fn test_large_string_array_from_list_array() { + _test_generic_string_array_from_list_array::(); + } + + fn _test_generic_string_array_from_list_array_with_child_nulls_failed< + O: OffsetSizeTrait, + >() { + let values = b"HelloArrow"; + let child_data = ArrayData::builder(DataType::UInt8) + .len(10) + .add_buffer(Buffer::from(&values[..])) + .null_bit_buffer(Some(Buffer::from_slice_ref(&[0b1010101010]))) + .build() + .unwrap(); + + let offsets = [0, 5, 10].map(|n| O::from_usize(n).unwrap()); + let data_type = if O::IS_LARGE { + DataType::LargeList + } else { + DataType::List + }(Box::new(Field::new("item", DataType::UInt8, false))); + + // [None, Some(b"Parquet")] + let array_data = ArrayData::builder(data_type) + .len(2) + .add_buffer(Buffer::from_slice_ref(&offsets)) + .add_child_data(child_data) + .build() + .unwrap(); + let list_array = GenericListArray::::from(array_data); + drop(GenericStringArray::::from(list_array)); + } + + #[test] + #[should_panic(expected = "The child array cannot contain null values.")] + fn test_stirng_array_from_list_array_with_child_nulls_failed() { + _test_generic_string_array_from_list_array_with_child_nulls_failed::(); + } + + #[test] + #[should_panic(expected = "The child array cannot contain null values.")] + fn test_large_string_array_from_list_array_with_child_nulls_failed() { + _test_generic_string_array_from_list_array_with_child_nulls_failed::(); + } + + fn _test_generic_string_array_from_list_array_wrong_type() { + let values = b"HelloArrow"; + let child_data = ArrayData::builder(DataType::UInt16) + .len(5) + .add_buffer(Buffer::from(&values[..])) + .build() + .unwrap(); + + let offsets = [0, 2, 3].map(|n| O::from_usize(n).unwrap()); + let data_type = if O::IS_LARGE { + DataType::LargeList + } else { + DataType::List + }(Box::new(Field::new("item", DataType::UInt16, false))); + + let array_data = ArrayData::builder(data_type) + .len(2) + .add_buffer(Buffer::from_slice_ref(&offsets)) + .add_child_data(child_data) + .build() + .unwrap(); + let list_array = GenericListArray::::from(array_data); + drop(GenericStringArray::::from(list_array)); + } + + #[test] + #[should_panic( + expected = "StringArray can only be created from List arrays, mismatched data types." + )] + fn test_string_array_from_list_array_wrong_type() { + _test_generic_string_array_from_list_array_wrong_type::(); + } + + #[test] + #[should_panic( + expected = "StringArray can only be created from List arrays, mismatched data types." + )] + fn test_large_string_array_from_list_array_wrong_type() { + _test_generic_string_array_from_list_array_wrong_type::(); + } }