Skip to content

Commit

Permalink
Fix bugs in the from_list function. (#2277)
Browse files Browse the repository at this point in the history
* update binary_from_list

Signed-off-by: remzi <[email protected]>

* fix binary from list

Signed-off-by: remzi <[email protected]>

* fix decimal from fixed list

Signed-off-by: remzi <[email protected]>

* fix fixed binary from fixed list

Signed-off-by: remzi <[email protected]>

* fix string from list

Signed-off-by: remzi <[email protected]>

* add child length check

Signed-off-by: remzi <[email protected]>

* clean the code

Signed-off-by: remzi <[email protected]>
  • Loading branch information
HaoYang670 authored Aug 2, 2022
1 parent bde749e commit 58dc611
Show file tree
Hide file tree
Showing 4 changed files with 417 additions and 62 deletions.
165 changes: 114 additions & 51 deletions arrow/src/array/array_binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -134,21 +134,35 @@ impl<OffsetSize: OffsetSizeTrait> GenericBinaryArray<OffsetSize> {

fn from_list(v: GenericListArray<OffsetSize>) -> Self {
assert_eq!(
v.data_ref().child_data()[0].child_data().len(),
v.data_ref().child_data().len(),
1,
"BinaryArray can only be created from list array of u8 values \
(i.e. List<PrimitiveArray<u8>>)."
);
let child_data = &v.data_ref().child_data()[0];

assert_eq!(
child_data.child_data().len(),
0,
"BinaryArray can only be created from list array of u8 values \
(i.e. List<PrimitiveArray<u8>>)."
);
assert_eq!(
v.data_ref().child_data()[0].data_type(),
child_data.data_type(),
&DataType::UInt8,
"BinaryArray can only be created from List<u8> arrays, mismatched data types."
);
assert_eq!(
child_data.null_count(),
0,
"The child array cannot contain null values."
);

let builder = ArrayData::builder(Self::get_data_type())
.len(v.len())
.offset(v.offset())
.add_buffer(v.data_ref().buffers()[0].clone())
.add_buffer(v.data_ref().child_data()[0].buffers()[0].clone())
.add_buffer(child_data.buffers()[0].slice(child_data.offset()))
.null_bit_buffer(v.data_ref().null_buffer().cloned());

let data = unsafe { builder.build_unchecked() };
Expand Down Expand Up @@ -441,10 +455,7 @@ pub type LargeBinaryArray = GenericBinaryArray<i64>;
#[cfg(test)]
mod tests {
use super::*;
use crate::{
array::{LargeListArray, ListArray},
datatypes::Field,
};
use crate::{array::ListArray, datatypes::Field};

#[test]
fn test_binary_array() {
Expand Down Expand Up @@ -577,37 +588,38 @@ mod tests {
assert_eq!(7, binary_array.value_length(1));
}

#[test]
fn test_binary_array_from_list_array() {
let values: [u8; 12] = [
b'h', b'e', b'l', b'l', b'o', b'p', b'a', b'r', b'q', b'u', b'e', b't',
];
let values_data = ArrayData::builder(DataType::UInt8)
fn _test_generic_binary_array_from_list_array<O: OffsetSizeTrait>() {
let values = b"helloparquet";
let child_data = ArrayData::builder(DataType::UInt8)
.len(12)
.add_buffer(Buffer::from(&values[..]))
.build()
.unwrap();
let offsets: [i32; 4] = [0, 5, 5, 12];
let offsets = [0, 5, 5, 12].map(|n| O::from_usize(n).unwrap());

// Array data: ["hello", "", "parquet"]
let array_data1 = ArrayData::builder(DataType::Binary)
let array_data1 = ArrayData::builder(GenericBinaryArray::<O>::get_data_type())
.len(3)
.add_buffer(Buffer::from_slice_ref(&offsets))
.add_buffer(Buffer::from_slice_ref(&values))
.build()
.unwrap();
let binary_array1 = BinaryArray::from(array_data1);
let binary_array1 = GenericBinaryArray::<O>::from(array_data1);

let data_type = if O::IS_LARGE {
DataType::LargeList
} else {
DataType::List
}(Box::new(Field::new("item", DataType::UInt8, false)));

let data_type =
DataType::List(Box::new(Field::new("item", DataType::UInt8, false)));
let array_data2 = ArrayData::builder(data_type)
.len(3)
.add_buffer(Buffer::from_slice_ref(&offsets))
.add_child_data(values_data)
.add_child_data(child_data)
.build()
.unwrap();
let list_array = ListArray::from(array_data2);
let binary_array2 = BinaryArray::from(list_array);
let list_array = GenericListArray::<O>::from(array_data2);
let binary_array2 = GenericBinaryArray::<O>::from(list_array);

assert_eq!(2, binary_array2.data().buffers().len());
assert_eq!(0, binary_array2.data().child_data().len());
Expand All @@ -624,51 +636,102 @@ mod tests {
}
}

#[test]
fn test_binary_array_from_list_array() {
_test_generic_binary_array_from_list_array::<i32>();
}

#[test]
fn test_large_binary_array_from_list_array() {
let values: [u8; 12] = [
b'h', b'e', b'l', b'l', b'o', b'p', b'a', b'r', b'q', b'u', b'e', b't',
];
let values_data = ArrayData::builder(DataType::UInt8)
.len(12)
_test_generic_binary_array_from_list_array::<i64>();
}

fn _test_generic_binary_array_from_list_array_with_offset<O: OffsetSizeTrait>() {
let values = b"HelloArrowAndParquet";
// b"ArrowAndParquet"
let child_data = ArrayData::builder(DataType::UInt8)
.len(15)
.offset(5)
.add_buffer(Buffer::from(&values[..]))
.build()
.unwrap();
let offsets: [i64; 4] = [0, 5, 5, 12];

// Array data: ["hello", "", "parquet"]
let array_data1 = ArrayData::builder(DataType::LargeBinary)
.len(3)
let offsets = [0, 5, 8, 15].map(|n| O::from_usize(n).unwrap());
let null_buffer = Buffer::from_slice_ref(&[0b101]);
let data_type = if O::IS_LARGE {
DataType::LargeList
} else {
DataType::List
}(Box::new(Field::new("item", DataType::UInt8, false)));

// [None, Some(b"Parquet")]
let array_data = ArrayData::builder(data_type)
.len(2)
.offset(1)
.add_buffer(Buffer::from_slice_ref(&offsets))
.add_buffer(Buffer::from_slice_ref(&values))
.null_bit_buffer(Some(null_buffer))
.add_child_data(child_data)
.build()
.unwrap();
let binary_array1 = LargeBinaryArray::from(array_data1);
let list_array = GenericListArray::<O>::from(array_data);
let binary_array = GenericBinaryArray::<O>::from(list_array);

let data_type =
DataType::LargeList(Box::new(Field::new("item", DataType::UInt8, false)));
let array_data2 = ArrayData::builder(data_type)
.len(3)
assert_eq!(2, binary_array.len());
assert_eq!(1, binary_array.null_count());
assert!(binary_array.is_null(0));
assert!(binary_array.is_valid(1));
assert_eq!(b"Parquet", binary_array.value(1));
}

#[test]
fn test_binary_array_from_list_array_with_offset() {
_test_generic_binary_array_from_list_array_with_offset::<i32>();
}

#[test]
fn test_large_binary_array_from_list_array_with_offset() {
_test_generic_binary_array_from_list_array_with_offset::<i64>();
}

fn _test_generic_binary_array_from_list_array_with_child_nulls_failed<
O: OffsetSizeTrait,
>() {
let values = b"HelloArrow";
let child_data = ArrayData::builder(DataType::UInt8)
.len(10)
.add_buffer(Buffer::from(&values[..]))
.null_bit_buffer(Some(Buffer::from_slice_ref(&[0b1010101010])))
.build()
.unwrap();

let offsets = [0, 5, 10].map(|n| O::from_usize(n).unwrap());
let data_type = if O::IS_LARGE {
DataType::LargeList
} else {
DataType::List
}(Box::new(Field::new("item", DataType::UInt8, false)));

// [None, Some(b"Parquet")]
let array_data = ArrayData::builder(data_type)
.len(2)
.add_buffer(Buffer::from_slice_ref(&offsets))
.add_child_data(values_data)
.add_child_data(child_data)
.build()
.unwrap();
let list_array = LargeListArray::from(array_data2);
let binary_array2 = LargeBinaryArray::from(list_array);
let list_array = GenericListArray::<O>::from(array_data);
drop(GenericBinaryArray::<O>::from(list_array));
}

assert_eq!(2, binary_array2.data().buffers().len());
assert_eq!(0, binary_array2.data().child_data().len());
#[test]
#[should_panic(expected = "The child array cannot contain null values.")]
fn test_binary_array_from_list_array_with_child_nulls_failed() {
_test_generic_binary_array_from_list_array_with_child_nulls_failed::<i32>();
}

assert_eq!(binary_array1.len(), binary_array2.len());
assert_eq!(binary_array1.null_count(), binary_array2.null_count());
assert_eq!(binary_array1.value_offsets(), binary_array2.value_offsets());
for i in 0..binary_array1.len() {
assert_eq!(binary_array1.value(i), binary_array2.value(i));
assert_eq!(binary_array1.value(i), unsafe {
binary_array2.value_unchecked(i)
});
assert_eq!(binary_array1.value_length(i), binary_array2.value_length(i));
}
#[test]
#[should_panic(expected = "The child array cannot contain null values.")]
fn test_large_binary_array_from_list_array_with_child_nulls_failed() {
_test_generic_binary_array_from_list_array_with_child_nulls_failed::<i64>();
}

fn test_generic_binary_array_from_opt_vec<T: OffsetSizeTrait>() {
Expand Down
80 changes: 77 additions & 3 deletions arrow/src/array/array_decimal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -187,24 +187,42 @@ pub trait BasicDecimalArray<T: BasicDecimal, U: From<ArrayData>>:
/// Build a decimal array from [`FixedSizeListArray`].
///
/// NB: This function does not validate that each value is in the permissible
/// range for a decimal. And, the null buffer of the child array will be ignored.
/// range for a decimal.
#[deprecated(note = "please use `from_fixed_size_binary_array` instead")]
fn from_fixed_size_list_array(
v: FixedSizeListArray,
precision: usize,
scale: usize,
) -> U {
assert_eq!(
v.data_ref().child_data().len(),
1,
"DecimalArray can only be created from list array of u8 values \
(i.e. FixedSizeList<PrimitiveArray<u8>>)."
);
let child_data = &v.data_ref().child_data()[0];

assert_eq!(
child_data.child_data().len(),
0,
"Decimal128Array can only be created from list array of u8 values \
"DecimalArray can only be created from list array of u8 values \
(i.e. FixedSizeList<PrimitiveArray<u8>>)."
);
assert_eq!(
child_data.data_type(),
&DataType::UInt8,
"Decimal128Array can only be created from FixedSizeList<u8> arrays, mismatched data types."
"DecimalArray can only be created from FixedSizeList<u8> arrays, mismatched data types."
);
assert!(
v.value_length() == Self::VALUE_LENGTH,
"Value length of the array ({}) must equal to the byte width of the decimal ({})",
v.value_length(),
Self::VALUE_LENGTH,
);
assert_eq!(
v.data_ref().child_data()[0].null_count(),
0,
"The child array cannot contain null values."
);

let list_offset = v.offset();
Expand Down Expand Up @@ -841,6 +859,62 @@ mod tests {
assert_eq!(decimal.value_as_string(1), "56".to_string());
}

#[test]
#[allow(deprecated)]
#[should_panic(expected = "The child array cannot contain null values.")]
fn test_decimal_array_from_fixed_size_list_with_child_nulls_failed() {
let value_data = ArrayData::builder(DataType::UInt8)
.len(16)
.add_buffer(Buffer::from_slice_ref(&[12_i128]))
.null_bit_buffer(Some(Buffer::from_slice_ref(&[0b1010101010101010])))
.build()
.unwrap();

// Construct a list array from the above two
let list_data_type = DataType::FixedSizeList(
Box::new(Field::new("item", DataType::UInt8, false)),
16,
);
let list_data = ArrayData::builder(list_data_type)
.len(1)
.add_child_data(value_data)
.build()
.unwrap();
let list_array = FixedSizeListArray::from(list_data);
drop(Decimal128Array::from_fixed_size_list_array(
list_array, 38, 0,
));
}

#[test]
#[allow(deprecated)]
#[should_panic(
expected = "Value length of the array (8) must equal to the byte width of the decimal (16)"
)]
fn test_decimal_array_from_fixed_size_list_with_wrong_length() {
let value_data = ArrayData::builder(DataType::UInt8)
.len(16)
.add_buffer(Buffer::from_slice_ref(&[12_i128]))
.null_bit_buffer(Some(Buffer::from_slice_ref(&[0b1010101010101010])))
.build()
.unwrap();

// Construct a list array from the above two
let list_data_type = DataType::FixedSizeList(
Box::new(Field::new("item", DataType::UInt8, false)),
8,
);
let list_data = ArrayData::builder(list_data_type)
.len(2)
.add_child_data(value_data)
.build()
.unwrap();
let list_array = FixedSizeListArray::from(list_data);
drop(Decimal128Array::from_fixed_size_list_array(
list_array, 38, 0,
));
}

#[test]
fn test_decimal256_iter() {
let mut builder = Decimal256Builder::new(30, 76, 6);
Expand Down
Loading

0 comments on commit 58dc611

Please sign in to comment.