From 20d81f5784e5609666fa3d0599f2f95e5e5a40c6 Mon Sep 17 00:00:00 2001 From: Max Burke Date: Sun, 13 Nov 2022 10:23:29 -0800 Subject: [PATCH] Add FixedSizeBinaryArray::try_from_sparse_iter_with_size (#3054) * implement ord for FixedSizeBinary types * add ord test * add compare for fixed size binary arrays * FixedSizeBinaryArray::try_from_sparse_iter has fails to generate a valid array when the iterator only produces None values * Tweak try_from_sparse_iter_with_size to take an Option; pass tests * simplify try_from_sparse_iter_with_size, make size parameter non-optional * add test for fixed size binary comparisons * move tests to use FixedSizeBinaryArray::from_sparse_iter_with_size, add docstring * format + fix failing tests * fix build --- .../src/array/fixed_size_binary_array.rs | 121 +++++++++++++++++- arrow-select/src/take.rs | 7 +- arrow/src/array/ffi.rs | 3 +- arrow/src/compute/kernels/comparison.rs | 41 ++++++ arrow/src/compute/kernels/sort.rs | 15 ++- arrow/src/compute/kernels/substring.rs | 3 +- arrow/src/ffi.rs | 8 +- arrow/src/util/bench_util.rs | 25 ++-- arrow/tests/array_transform.rs | 9 +- 9 files changed, 198 insertions(+), 34 deletions(-) diff --git a/arrow-array/src/array/fixed_size_binary_array.rs b/arrow-array/src/array/fixed_size_binary_array.rs index f37d1e3e5c38..9bac49810301 100644 --- a/arrow-array/src/array/fixed_size_binary_array.rs +++ b/arrow-array/src/array/fixed_size_binary_array.rs @@ -129,6 +129,9 @@ impl FixedSizeBinaryArray { /// # Errors /// /// Returns error if argument has length zero, or sizes of nested slices don't match. + #[deprecated( + note = "This function will fail if the iterator produces only None values; prefer `try_from_sparse_iter_with_size`" + )] pub fn try_from_sparse_iter(mut iter: T) -> Result where T: Iterator>, @@ -196,6 +199,86 @@ impl FixedSizeBinaryArray { Ok(FixedSizeBinaryArray::from(array_data)) } + /// Create an array from an iterable argument of sparse byte slices. + /// Sparsity means that items returned by the iterator are optional, i.e input argument can + /// contain `None` items. In cases where the iterator returns only `None` values, this + /// also takes a size parameter to ensure that the a valid FixedSizeBinaryArray is still + /// created. + /// + /// # Examples + /// + /// ``` + /// use arrow_array::FixedSizeBinaryArray; + /// let input_arg = vec![ + /// None, + /// Some(vec![7, 8]), + /// Some(vec![9, 10]), + /// None, + /// Some(vec![13, 14]), + /// None, + /// ]; + /// let array = FixedSizeBinaryArray::try_from_sparse_iter_with_size(input_arg.into_iter(), 2).unwrap(); + /// ``` + /// + /// # Errors + /// + /// Returns error if argument has length zero, or sizes of nested slices don't match. + pub fn try_from_sparse_iter_with_size( + mut iter: T, + size: i32, + ) -> Result + where + T: Iterator>, + U: AsRef<[u8]>, + { + let mut len = 0; + let mut byte = 0; + let mut null_buf = MutableBuffer::from_len_zeroed(0); + let mut buffer = MutableBuffer::from_len_zeroed(0); + + iter.try_for_each(|item| -> Result<(), ArrowError> { + // extend null bitmask by one byte per each 8 items + if byte == 0 { + null_buf.push(0u8); + byte = 8; + } + byte -= 1; + + if let Some(slice) = item { + let slice = slice.as_ref(); + if size as usize != slice.len() { + return Err(ArrowError::InvalidArgumentError(format!( + "Nested array size mismatch: one is {}, and the other is {}", + size, + slice.len() + ))); + } + + bit_util::set_bit(null_buf.as_slice_mut(), len); + buffer.extend_from_slice(slice); + } else { + buffer.extend_zeros(size as usize); + } + + len += 1; + + Ok(()) + })?; + + let array_data = unsafe { + ArrayData::new_unchecked( + DataType::FixedSizeBinary(size), + len, + None, + Some(null_buf.into()), + 0, + vec![buffer.into()], + vec![], + ) + }; + Ok(FixedSizeBinaryArray::from(array_data)) + } + /// Create an array from an iterable argument of byte slices. /// /// # Examples @@ -333,6 +416,7 @@ impl From for FixedSizeBinaryArray { impl From>> for FixedSizeBinaryArray { fn from(v: Vec>) -> Self { + #[allow(deprecated)] Self::try_from_sparse_iter(v.into_iter()).unwrap() } } @@ -561,6 +645,7 @@ mod tests { fn test_all_none_fixed_size_binary_array_from_sparse_iter() { let none_option: Option<[u8; 32]> = None; let input_arg = vec![none_option, none_option, none_option]; + #[allow(deprecated)] let arr = FixedSizeBinaryArray::try_from_sparse_iter(input_arg.into_iter()).unwrap(); assert_eq!(0, arr.value_length()); @@ -576,9 +661,31 @@ mod tests { None, Some(vec![13, 14]), ]; - let arr = - FixedSizeBinaryArray::try_from_sparse_iter(input_arg.into_iter()).unwrap(); + #[allow(deprecated)] + let arr = FixedSizeBinaryArray::try_from_sparse_iter(input_arg.iter().cloned()) + .unwrap(); assert_eq!(2, arr.value_length()); + assert_eq!(5, arr.len()); + + let arr = FixedSizeBinaryArray::try_from_sparse_iter_with_size( + input_arg.into_iter(), + 2, + ) + .unwrap(); + assert_eq!(2, arr.value_length()); + assert_eq!(5, arr.len()); + } + + #[test] + fn test_fixed_size_binary_array_from_sparse_iter_with_size_all_none() { + let input_arg = vec![None, None, None, None, None] as Vec>>; + + let arr = FixedSizeBinaryArray::try_from_sparse_iter_with_size( + input_arg.into_iter(), + 16, + ) + .unwrap(); + assert_eq!(16, arr.value_length()); assert_eq!(5, arr.len()) } @@ -643,7 +750,9 @@ mod tests { #[test] fn fixed_size_binary_array_all_null() { let data = vec![None] as Vec>; - let array = FixedSizeBinaryArray::try_from_sparse_iter(data.into_iter()).unwrap(); + let array = + FixedSizeBinaryArray::try_from_sparse_iter_with_size(data.into_iter(), 0) + .unwrap(); array .data() .validate_full() @@ -652,16 +761,14 @@ mod tests { #[test] // Test for https://github.com/apache/arrow-rs/issues/1390 - #[should_panic( - expected = "column types must match schema types, expected FixedSizeBinary(2) but found FixedSizeBinary(0) at column index 0" - )] fn fixed_size_binary_array_all_null_in_batch_with_schema() { let schema = Schema::new(vec![Field::new("a", DataType::FixedSizeBinary(2), true)]); let none_option: Option<[u8; 2]> = None; - let item = FixedSizeBinaryArray::try_from_sparse_iter( + let item = FixedSizeBinaryArray::try_from_sparse_iter_with_size( vec![none_option, none_option, none_option].into_iter(), + 2, ) .unwrap(); diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index d34a88ba53ce..4af876a79dcc 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -207,12 +207,12 @@ where DataType::LargeBinary => { Ok(Arc::new(take_bytes(as_generic_binary_array::(values), indices)?)) } - DataType::FixedSizeBinary(_) => { + DataType::FixedSizeBinary(size) => { let values = values .as_any() .downcast_ref::() .unwrap(); - Ok(Arc::new(take_fixed_size_binary(values, indices)?)) + Ok(Arc::new(take_fixed_size_binary(values, indices, *size)?)) } DataType::Null => { // Take applied to a null array produces a null array. @@ -769,6 +769,7 @@ where fn take_fixed_size_binary( values: &FixedSizeBinaryArray, indices: &PrimitiveArray, + size: i32, ) -> Result where IndexType: ArrowPrimitiveType, @@ -789,7 +790,7 @@ where .collect::, ArrowError>>()? .into_iter(); - FixedSizeBinaryArray::try_from_sparse_iter(array_iter) + FixedSizeBinaryArray::try_from_sparse_iter_with_size(array_iter, size) } /// `take` implementation for dictionary arrays diff --git a/arrow/src/array/ffi.rs b/arrow/src/array/ffi.rs index 72030f900a4e..a18f408a4566 100644 --- a/arrow/src/array/ffi.rs +++ b/arrow/src/array/ffi.rs @@ -212,7 +212,8 @@ mod tests { Some(vec![30, 30, 30]), None, ]; - let array = FixedSizeBinaryArray::try_from_sparse_iter(values.into_iter())?; + let array = + FixedSizeBinaryArray::try_from_sparse_iter_with_size(values.into_iter(), 3)?; let data = array.data(); test_round_trip(data) diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs index a286eedd190b..4566b4969295 100644 --- a/arrow/src/compute/kernels/comparison.rs +++ b/arrow/src/compute/kernels/comparison.rs @@ -2270,6 +2270,18 @@ macro_rules! typed_compares { as_largestring_array($RIGHT), $OP, ), + (DataType::FixedSizeBinary(_), DataType::FixedSizeBinary(_)) => { + let lhs = $LEFT + .as_any() + .downcast_ref::() + .unwrap(); + let rhs = $RIGHT + .as_any() + .downcast_ref::() + .unwrap(); + + compare_op(lhs, rhs, $OP) + } (DataType::Binary, DataType::Binary) => compare_op( as_generic_binary_array::($LEFT), as_generic_binary_array::($RIGHT), @@ -5449,6 +5461,35 @@ mod tests { ); } + #[test] + fn test_eq_dyn_neq_dyn_fixed_size_binary() { + use crate::array::FixedSizeBinaryArray; + + let values1: Vec> = + vec![Some(&[0xfc, 0xa9]), None, Some(&[0x36, 0x01])]; + let values2: Vec> = + vec![Some(&[0xfc, 0xa9]), None, Some(&[0x36, 0x00])]; + + let array1 = + FixedSizeBinaryArray::try_from_sparse_iter_with_size(values1.into_iter(), 2) + .unwrap(); + let array2 = + FixedSizeBinaryArray::try_from_sparse_iter_with_size(values2.into_iter(), 2) + .unwrap(); + + let result = eq_dyn(&array1, &array2).unwrap(); + assert_eq!( + BooleanArray::from(vec![Some(true), None, Some(false)]), + result + ); + + let result = neq_dyn(&array1, &array2).unwrap(); + assert_eq!( + BooleanArray::from(vec![Some(false), None, Some(true)]), + result + ); + } + #[test] #[cfg(feature = "dyn_cmp_dict")] fn test_eq_dyn_neq_dyn_dictionary_i8_array() { diff --git a/arrow/src/compute/kernels/sort.rs b/arrow/src/compute/kernels/sort.rs index 97b0758e5dc7..81895760e588 100644 --- a/arrow/src/compute/kernels/sort.rs +++ b/arrow/src/compute/kernels/sort.rs @@ -1437,17 +1437,24 @@ mod tests { fixed_length: Option, ) { // Fixed size binary array - if fixed_length.is_some() { + if let Some(length) = fixed_length { let input = Arc::new( - FixedSizeBinaryArray::try_from_sparse_iter(data.iter().cloned()).unwrap(), + FixedSizeBinaryArray::try_from_sparse_iter_with_size( + data.iter().cloned(), + length, + ) + .unwrap(), ); let sorted = match limit { Some(_) => sort_limit(&(input as ArrayRef), options, limit).unwrap(), None => sort(&(input as ArrayRef), options).unwrap(), }; let expected = Arc::new( - FixedSizeBinaryArray::try_from_sparse_iter(expected_data.iter().cloned()) - .unwrap(), + FixedSizeBinaryArray::try_from_sparse_iter_with_size( + expected_data.iter().cloned(), + length, + ) + .unwrap(), ) as ArrayRef; assert_eq!(&sorted, &expected); diff --git a/arrow/src/compute/kernels/substring.rs b/arrow/src/compute/kernels/substring.rs index f52ddb3bc30b..76568ae0dac0 100644 --- a/arrow/src/compute/kernels/substring.rs +++ b/arrow/src/compute/kernels/substring.rs @@ -713,8 +713,9 @@ mod tests { .as_any() .downcast_ref::() .unwrap(); - let expected = FixedSizeBinaryArray::try_from_sparse_iter( + let expected = FixedSizeBinaryArray::try_from_sparse_iter_with_size( vec![None, Some(b"rrow")].into_iter(), + 4, ) .unwrap(); assert_eq!(result, &expected); diff --git a/arrow/src/ffi.rs b/arrow/src/ffi.rs index 95e6dce3c5fd..03c265318185 100644 --- a/arrow/src/ffi.rs +++ b/arrow/src/ffi.rs @@ -1231,7 +1231,8 @@ mod tests { Some(vec![30, 30, 30]), None, ]; - let array = FixedSizeBinaryArray::try_from_sparse_iter(values.into_iter())?; + let array = + FixedSizeBinaryArray::try_from_sparse_iter_with_size(values.into_iter(), 3)?; // export it let array = ArrowArray::try_from(array.into_data())?; @@ -1250,7 +1251,7 @@ mod tests { // verify assert_eq!( array, - &FixedSizeBinaryArray::try_from_sparse_iter( + &FixedSizeBinaryArray::try_from_sparse_iter_with_size( vec![ None, Some(vec![10, 10, 10]), @@ -1265,7 +1266,8 @@ mod tests { Some(vec![30, 30, 30]), None, ] - .into_iter() + .into_iter(), + 3 )? ); diff --git a/arrow/src/util/bench_util.rs b/arrow/src/util/bench_util.rs index d07443301c16..6420b6346feb 100644 --- a/arrow/src/util/bench_util.rs +++ b/arrow/src/util/bench_util.rs @@ -176,17 +176,20 @@ pub fn create_fsb_array( ) -> FixedSizeBinaryArray { let rng = &mut seedable_rng(); - FixedSizeBinaryArray::try_from_sparse_iter((0..size).map(|_| { - if rng.gen::() < null_density { - None - } else { - let value = rng - .sample_iter::(Standard) - .take(value_len) - .collect::>(); - Some(value) - } - })) + FixedSizeBinaryArray::try_from_sparse_iter_with_size( + (0..size).map(|_| { + if rng.gen::() < null_density { + None + } else { + let value = rng + .sample_iter::(Standard) + .take(value_len) + .collect::>(); + Some(value) + } + }), + value_len as i32, + ) .unwrap() } diff --git a/arrow/tests/array_transform.rs b/arrow/tests/array_transform.rs index 3619abacdc9d..03942be10e01 100644 --- a/arrow/tests/array_transform.rs +++ b/arrow/tests/array_transform.rs @@ -868,7 +868,7 @@ fn test_list_of_strings_append() { #[test] fn test_fixed_size_binary_append() { let a = vec![Some(vec![1, 2]), Some(vec![3, 4]), Some(vec![5, 6])]; - let a = FixedSizeBinaryArray::try_from_sparse_iter(a.into_iter()) + let a = FixedSizeBinaryArray::try_from_sparse_iter_with_size(a.into_iter(), 2) .expect("Failed to create FixedSizeBinaryArray from iterable"); let b = vec![ @@ -879,7 +879,7 @@ fn test_fixed_size_binary_append() { Some(vec![13, 14]), None, ]; - let b = FixedSizeBinaryArray::try_from_sparse_iter(b.into_iter()) + let b = FixedSizeBinaryArray::try_from_sparse_iter_with_size(b.into_iter(), 2) .expect("Failed to create FixedSizeBinaryArray from iterable"); let mut mutable = MutableArrayData::new(vec![a.data(), b.data()], false, 10); @@ -911,8 +911,9 @@ fn test_fixed_size_binary_append() { Some(vec![9, 10]), // b[4..4] ]; - let expected = FixedSizeBinaryArray::try_from_sparse_iter(expected.into_iter()) - .expect("Failed to create FixedSizeBinaryArray from iterable"); + let expected = + FixedSizeBinaryArray::try_from_sparse_iter_with_size(expected.into_iter(), 2) + .expect("Failed to create FixedSizeBinaryArray from iterable"); assert_eq!(&result, expected.data()); }